diff --git a/.github/workflows/step_images.yml b/.github/workflows/step_images.yml index 8f9959b8..25c79452 100644 --- a/.github/workflows/step_images.yml +++ b/.github/workflows/step_images.yml @@ -87,7 +87,7 @@ jobs: - name: Push README to registry uses: christian-korneck/update-container-description-action@d36005551adeaba9698d8d67a296bd16fa91f8e8 # v1 - if: (github.ref == 'refs/heads/main' || (github.event_name == 'push' && contains(github.ref, 'refs/tags/'))) && github.repository_owner == 'ceems' # Don't run this workflow on forks. + if: (github.ref == 'refs/heads/main' || (github.event_name == 'push' && contains(github.ref, 'refs/tags/'))) && github.repository_owner == 'ceems-dev' # Don't run this workflow on forks. env: # For dockerhub registry DOCKER_USER: ${{ secrets.login }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b958b49..f11fd5e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,12 @@ # Changelog -## 0.11.0 / 2025-*-* +## 0.11.0 / 2025-* ### Breaking Changes #### CEEMS Exporter +- Collector `rapl` is disabled by default now and to enable it add `--collector.rapl` to CLI arguments. - Collector `ipmi_dcmi` has been renamed to `ipmi` as more functionality beyond DCMI has been added to the collector. - Following metric labels have been renamed to be more consistent with Prometheus naming convention: * `ceems_ipmi_dcmi_current_watts` -> `ceems_ipmi_dcmi_power_current_watts` @@ -17,6 +18,20 @@ * `ceems_redfish_max_watts` -> `ceems_redfish_power_max_watts` * `ceems_redfish_avg_watts` -> `ceems_redfish_power_avg_watts` +#### CEEMS tool + +- The relabel configs generated by subcommand `create-relabel-configs` are obsolete as the relabelling of metrics directly handled inside the recording rules. Please +regenerate recording rules with new version and remove existing relabel configs on Prometheus server. +- Several minor bugs in recording rules have been fixed. Please regenerate the recording rules with new version of `ceems_tool`. 
+- GPU profiling metrics have been renamed to have `prof` in the metric label. For instance, `uuid:ceems_gpu_sm_active:ratio` became +`uuid:ceems_gpu_prof_sm_active:ratio`. +- NVIDIA profiling metrics suffix has been corrected to use `sum` instead of `ratio` for NVLink, PCIe traffic metrics. Thus, metrics +have been renamed as follows: + * `uuid:ceems_gpu_pcie_tx_bytes:ratio` -> `uuid:ceems_gpu_prof_pcie_tx_bytes:sum` + * `uuid:ceems_gpu_pcie_rx_bytes:ratio` -> `uuid:ceems_gpu_prof_pcie_rx_bytes:sum` + * `uuid:ceems_gpu_nvlink_tx_bytes:ratio` -> `uuid:ceems_gpu_prof_nvlink_tx_bytes:sum` + * `uuid:ceems_gpu_nvlink_rx_bytes:ratio` -> `uuid:ceems_gpu_prof_nvlink_rx_bytes:sum` + ## 0.10.2 / 2025-08-07 - [BUGFIX] Fix bpf code to work with LLVM 20 [#393](https://github.com/mahendrapaipuri/ceems/pull/393) ([@mahendrapaipuri](https://github.com/mahendrapaipuri)) diff --git a/build/package/cacct/postinstall.sh b/build/package/cacct/postinstall.sh index 529d6475..75afb310 100755 --- a/build/package/cacct/postinstall.sh +++ b/build/package/cacct/postinstall.sh @@ -6,13 +6,13 @@ cleanInstall() { printf "\033[32m Post Install of an clean install\033[0m\n" # Step 3 (clean install), setup setgid bit on cacct - chmod g+s /usr/local/bin/cacct + chmod u+s /usr/local/bin/cacct } upgrade() { printf "\033[32m Post Install of an upgrade\033[0m\n" # Step 3(upgrade), setup setgid bit on cacct - chmod g+s /usr/local/bin/cacct + chmod u+s /usr/local/bin/cacct } # Step 2, check if this is a clean install or an upgrade diff --git a/cmd/cacct/api.go b/cmd/cacct/api.go index 55b1364a..3f91debb 100644 --- a/cmd/cacct/api.go +++ b/cmd/cacct/api.go @@ -25,6 +25,7 @@ const ( // stats returns units and usage structs by making requests to CEEMS API server. func stats( + config *Config, currentUser string, start time.Time, end time.Time, @@ -35,15 +36,6 @@ func stats( tsData bool, tsDataOut string, ) ([]models.Unit, []models.Usage, error) { - // By this time, user input is validated. 
Time to read config file - // to get HTTP config to connect to CEEMS API server. - // Either setuid or setgid bits must be applied on the app so that - // the config file can be read as the owner of this app - config, err := readConfig() - if err != nil { - return nil, nil, fmt.Errorf("failed to read config file: %w", err) - } - // Add user header to HTTP config userHeaders := http_config.Header{ Values: []string{currentUser}, @@ -143,7 +135,8 @@ func stats( return units, usage, nil } - if err := tsdbData(ctx, config, units, tsDataOut); err != nil { + err := tsdbData(ctx, config, units, tsDataOut) + if err != nil { fmt.Fprintln(os.Stderr, "failed to fetch time series data", err) } } @@ -163,7 +156,8 @@ func makeRequest[T any](ctx context.Context, reqURL string, urlValues url.Values req.URL.RawQuery = urlValues.Encode() // Make request - if resp, err := client.Do(req); err != nil { + resp, err := client.Do(req) + if err != nil { return nil, err } else { defer resp.Body.Close() @@ -181,7 +175,9 @@ func makeRequest[T any](ctx context.Context, reqURL string, urlValues url.Values // Unpack into data var data Response[T] - if err = json.Unmarshal(body, &data); err != nil { + + err = json.Unmarshal(body, &data) + if err != nil { return nil, err } diff --git a/cmd/cacct/main.go b/cmd/cacct/main.go index 7061e9f8..3ed8fbfa 100644 --- a/cmd/cacct/main.go +++ b/cmd/cacct/main.go @@ -8,7 +8,9 @@ import ( "os/user" "path/filepath" "slices" + "strconv" "strings" + "syscall" "time" "github.com/alecthomas/kingpin/v2" @@ -301,7 +303,8 @@ func (c *Config) UnmarshalYAML(unmarshal func(any) error) error { type plain Config - if err := unmarshal((*plain)(c)); err != nil { + err := unmarshal((*plain)(c)) + if err != nil { return err } @@ -322,14 +325,16 @@ func (w *WebConfig) UnmarshalYAML(unmarshal func(any) error) error { type plain WebConfig - if err := unmarshal((*plain)(w)); err != nil { + err := unmarshal((*plain)(w)) + if err != nil { return err } // The UnmarshalYAML method of 
HTTPClientConfig is not being called because it's not a pointer. // We cannot make it a pointer as the parser panics for inlined pointer structs. // Thus we just do its validation here. - if err := w.HTTPClientConfig.Validate(); err != nil { + err = w.HTTPClientConfig.Validate() + if err != nil { return err } @@ -398,7 +403,8 @@ func main() { "markdown", "Produce markdown output (default: false).", ).Default("false").BoolVar(&mdOut) - if _, err := cacctApp.Parse(os.Args[1:]); err != nil { + _, err := cacctApp.Parse(os.Args[1:]) + if err != nil { kingpin.Fatalf("failed to parse CLI flags: %v", err) } @@ -444,25 +450,26 @@ func main() { // Always add started and ended ts fields as we will need them for TSDB data retrieval fields = append(fields, []string{"started_at_ts", "ended_at_ts"}...) - // Ensure --job flag is passed when asking for metric data - // This is to avoid fetching metrics of too many jobs when only - // period is set - if tsData && len(jobs) == 0 { - kingpin.Fatalf("explicit job IDs must be passed using --job when --ts is enabled") - } - // Convert start and end times to time.Time var start, end time.Time - var err error - if start, err = parseTime(startTime); err != nil { + start, err = parseTime(startTime) + if err != nil { kingpin.Fatalf("failed to parse --starttime flag: %v", err) } - if end, err = parseTime(endTime); err != nil { + end, err = parseTime(endTime) + if err != nil { kingpin.Fatalf("failed to parse --endtime flag: %v", err) } + // Ensure to limit period to 1 week asking for metric data + // This is to avoid fetching metrics of too many jobs when only + // period is set + if tsData && end.Sub(start) > 7*24*time.Hour { + kingpin.Fatalf("limit period between --starttime and --endtime to 7 days when --ts is enabled") + } + // Get current user and add user's config dir to slice of config // dirs. 
// If current user is root and mockCurrentUser and/or mockConfigPath @@ -474,18 +481,56 @@ func main() { } // Check if currentUser is only user in userNames and if so, set userNames to nil - if len(userNames) == 1 && userNames[0] == currentUser { + if len(userNames) == 1 && userNames[0] == currentUser.Username { userNames = nil } + // By this time, user input is validated. Time to read config file + // to get HTTP config to connect to CEEMS API server. + // Either setuid or setgid bits must be applied on the app so that + // the config file can be read as the owner of this app + config, err := readConfig() + if err != nil { + os.Exit(checkErr(fmt.Errorf("failed to read config file: %w", err))) + } + + // Now time to drop privileges so that rest of app will be run as regular user + // who invoked it. It is necessary so to be able to create directories and files + // to user's space. + // The condition ensures that it will be executed only in production and not in e2e + // test cases + if mockCurrentUser == "" && mockConfigPath == "" { + // Convert UID anf GID to int + uid, err := strconv.Atoi(currentUser.Uid) + if err != nil { + os.Exit(checkErr(fmt.Errorf("failed to get current user uid: %w", err))) + } + + gid, err := strconv.Atoi(currentUser.Gid) + if err != nil { + os.Exit(checkErr(fmt.Errorf("failed to get current user gid: %w", err))) + } + + // Set UID and GID to current user + err = syscall.Setuid(uid) + if err != nil { + os.Exit(checkErr(fmt.Errorf("failed to set current user uid: %w", err))) + } + + err = syscall.Setgid(gid) + if err != nil { + os.Exit(checkErr(fmt.Errorf("failed to set current user gid: %w", err))) + } + } + // Get stats - units, usages, err := stats(currentUser, start, end, accounts, jobs, userNames, fields, tsData, tsDataOut) + units, usages, err := stats(config, currentUser.Username, start, end, accounts, jobs, userNames, fields, tsData, tsDataOut) if err != nil { os.Exit(checkErr(err)) } // Print stats as table - t := 
newTable(currentUser, userNames, units, usages) + t := newTable(currentUser.Username, userNames, units, usages) // Based on request rendering format switch { @@ -642,14 +687,17 @@ func readConfig() (*Config, error) { for _, configPath := range configPaths { for _, file := range []string{"config.yml", "config.yaml", "cacct.yml", "cacct.yaml"} { configFile := filepath.Join(configPath, file) - if _, err := os.Stat(configFile); err == nil { + + _, err := os.Stat(configFile) + if err == nil { // Read config file cfg, err := os.ReadFile(configFile) if err != nil { return nil, err } - if err = yaml.Unmarshal(cfg, &config); err != nil { + err = yaml.Unmarshal(cfg, &config) + if err != nil { return nil, err } @@ -663,32 +711,34 @@ func readConfig() (*Config, error) { // getCurrentUser returns the actual user executing the cacct. If --current-user // CLI flag is passed, that user will be returned as current user. -func getCurrentUser(mockUserName string, mockConfigPath string) (string, error) { +func getCurrentUser(mockUserName string, mockConfigPath string) (*user.User, error) { // Get current user is who is executing cacct - var currentUser string + var currentUser *user.User - if u, err := user.Current(); err != nil { - return "", fmt.Errorf("failed to get current user: %w", err) + // Get effective UID as cacct is a setuid binary + u, err := user.Current() + if err != nil { + return nil, fmt.Errorf("failed to get current user: %w", err) } else { // Check if mockUserName is set. 
This will be always empty string // for production builds as we do not compile flags for production // builds if mockUserName != "" { - currentUser = mockUserName + currentUser = &user.User{Username: mockUserName} // If mockConfigPath is set as well, add to configPaths if mockConfigPath != "" { configPaths = append(configPaths, mockConfigPath) } } else { - currentUser = u.Name + currentUser = u } } // Add user HOME to configPaths userConfigDir, err := os.UserConfigDir() if err != nil { - return "", fmt.Errorf("failed to get config file: %w", err) + return nil, fmt.Errorf("failed to get config file: %w", err) } configPaths = append(configPaths, filepath.Join(userConfigDir, "ceems")) @@ -698,17 +748,20 @@ func getCurrentUser(mockUserName string, mockConfigPath string) (string, error) func parseTime(s string) (time.Time, error) { // First attempt is to parse as YYYY-MM-DDTHH:MM:SS - if t, err := time.Parse("2006-01-02T15:04:05", s); err == nil { + t, err := time.Parse("2006-01-02T15:04:05", s) + if err == nil { return t.In(time.Local), nil } // Second attempt is to parse as YYYY-MM-DDTHH:MM - if t, err := time.Parse("2006-01-02T15:04", s); err == nil { + t, err = time.Parse("2006-01-02T15:04", s) + if err == nil { return t.In(time.Local), nil } // Third attempt is to parse as YYYY-MM-DD - if t, err := time.Parse("2006-01-02", s); err == nil { + t, err = time.Parse("2006-01-02", s) + if err == nil { return t.In(time.Local), nil } diff --git a/cmd/cacct/testdata/output/e2e-test-cacct-tsdata-fail.txt b/cmd/cacct/testdata/output/e2e-test-cacct-tsdata-fail.txt index 725bc411..1e7007e4 100644 --- a/cmd/cacct/testdata/output/e2e-test-cacct-tsdata-fail.txt +++ b/cmd/cacct/testdata/output/e2e-test-cacct-tsdata-fail.txt @@ -1 +1 @@ -cacct: error: explicit job IDs must be passed using --job when --ts is enabled +cacct: error: limit period between --starttime and --endtime to 7 days when --ts is enabled diff --git a/cmd/cacct/testdata/output/e2e-test-cacct-tsdata.txt 
b/cmd/cacct/testdata/output/e2e-test-cacct-tsdata.txt index 2255dcb7..132bc8ab 100644 --- a/cmd/cacct/testdata/output/e2e-test-cacct-tsdata.txt +++ b/cmd/cacct/testdata/output/e2e-test-cacct-tsdata.txt @@ -20,6 +20,7 @@ { "fingerprint": "554b56cadf9dea4b", "labels": { + "__name__": "cpu_usage", "instance": "localhost:9090", "uuid": "147973" } diff --git a/cmd/cacct/tsdb.go b/cmd/cacct/tsdb.go index 90a173ad..5f10e1e9 100644 --- a/cmd/cacct/tsdb.go +++ b/cmd/cacct/tsdb.go @@ -41,7 +41,8 @@ func tsdbData(ctx context.Context, config *Config, units []models.Unit, outDir s } // Create outDir for saving CSV files - if err := os.MkdirAll(outDir, 0o700); err != nil { + err = os.MkdirAll(outDir, 0o700) + if err != nil { return fmt.Errorf("failed to create directory for saving CSV files: %w", err) } @@ -50,11 +51,11 @@ func tsdbData(ctx context.Context, config *Config, units []models.Unit, outDir s // Fetch time series of each metric in separate go routine for _, unit := range units { - for _, query := range config.TSDB.Queries { + for queryID, query := range config.TSDB.Queries { wg.Add(1) // Fetch metrics from TSDB and write to CSV files - go fetchData(ctx, fmt.Sprintf(query, unit.UUID), unit.StartedAtTS, unit.EndedAtTS, outDir, tsdb, &wg) + go fetchData(ctx, queryID, fmt.Sprintf(query, unit.UUID), unit.StartedAtTS, unit.EndedAtTS, outDir, tsdb, &wg) } } @@ -70,7 +71,7 @@ func tsdbData(ctx context.Context, config *Config, units []models.Unit, outDir s } // fetchData retrieves time series data from TSDB. 
-func fetchData(ctx context.Context, query string, start int64, end int64, outDir string, tsdb *tsdb.Client, wg *sync.WaitGroup) { +func fetchData(ctx context.Context, queryID string, query string, start int64, end int64, outDir string, tsdb *tsdb.Client, wg *sync.WaitGroup) { defer wg.Done() // Make a range query @@ -89,10 +90,18 @@ func fetchData(ctx context.Context, query string, start int64, end int64, outDir // Get fingerprint fp := result.Metric.Fingerprint().String() + // Get labels + labels := result.Metric + + // Replace series name in labels with queryID + // This is more readable one and also allows us + // to protect Prometheus series names + labels["__name__"] = model.LabelValue(queryID) + // Add metadata of query md = append(md, queryMetadata{ Fingerprint: fp, - Labels: result.Metric, + Labels: labels, }) // Create file name based on fingerprint @@ -108,13 +117,15 @@ func fetchData(ctx context.Context, query string, start int64, end int64, outDir defer f.Close() // Write header - if err := writer.Write([]string{"timestamp", "value"}); err != nil { + err = writer.Write([]string{"timestamp", "value"}) + if err != nil { fmt.Fprintln(os.Stderr, "failed to write header:", err, "file:", csvFilepath) } // Write records for _, value := range result.Values { - if err := writer.Write([]string{value.Timestamp.String(), value.Value.String()}); err != nil { + err := writer.Write([]string{value.Timestamp.String(), value.Value.String()}) + if err != nil { fmt.Fprintln(os.Stderr, "failed to write data:", err, "file:", csvFilepath) } } @@ -137,19 +148,31 @@ func fetchData(ctx context.Context, query string, start int64, end int64, outDir // writeMetadata dumps the metadata.json file to outDir. 
func writeMetadata(mds []queryMetadata, outDir string) { + metadataFilepath := filepath.Join(outDir, "metadata.json") + + // Read existing metadata + content, err := os.ReadFile(metadataFilepath) + if err == nil { + var existingMD []queryMetadata + + err := json.Unmarshal(content, &existingMD) + if err == nil { + mds = append(mds, existingMD...) + } + } + // Dump metadata json buffer := new(bytes.Buffer) encoder := json.NewEncoder(buffer) encoder.SetIndent("", "\t") - if err := encoder.Encode(mds); err != nil { + err = encoder.Encode(mds) + if err != nil { fmt.Fprintln(os.Stderr, "failed to encode metadata", "err:", err) return } - metadataFilepath := filepath.Join(outDir, "metadata.json") - file, err := os.OpenFile(metadataFilepath, os.O_RDWR|os.O_CREATE, 0o600) if err != nil { fmt.Fprintln(os.Stderr, "failed to create metadata.json file", "err:", err) @@ -159,7 +182,8 @@ func writeMetadata(mds []queryMetadata, outDir string) { defer file.Close() - if _, err := file.Write(buffer.Bytes()); err != nil { + _, err = file.Write(buffer.Bytes()) + if err != nil { fmt.Fprintln(os.Stderr, "failed to write content to metadata.json file", "err:", err) return diff --git a/cmd/ceems_tool/relabel.go b/cmd/ceems_tool/relabel.go index ca8ca88a..fdc2de1f 100644 --- a/cmd/ceems_tool/relabel.go +++ b/cmd/ceems_tool/relabel.go @@ -8,6 +8,7 @@ import ( "net/url" "os" "slices" + "strings" "github.com/prometheus/common/model" "gopkg.in/yaml.v3" @@ -16,7 +17,7 @@ import ( var gpuSeries = []string{ "DCGM_FI_DEV_POWER_USAGE_INSTANT", "amd_gpu_power", - "gpu_power_usage", + "gpu_package_power", } // MetricRelabelConfig contains the Prometheus metric relabel config. @@ -151,37 +152,39 @@ func CreatePromRelabelConfig( Action: "labeldrop", }, } - case slices.Contains(jobSeries[job], model.LabelValue(gpuSeries[2])): - // Just like DCGM exporter, AMD device metrics exporter - // exports GPU index as gpu_id and GPU partition ID as - // gpu_partition_id. 
We will relabel them to match the - // CEEMS exporter - relabelConfigs = []MetricRelabelConfig{ - { - SourceLabels: []string{"gpu_id"}, - TargetLabel: "index", - Regex: "(.*)", - Replacement: "$1", - Action: "replace", - }, - { - SourceLabels: []string{"gpu_partition_id"}, - TargetLabel: "gpuiid", - Regex: "(.*)", - Replacement: "$1", - Action: "replace", - }, - { - Regex: "gpu_id", - Action: "labeldrop", - }, - { - Regex: "gpu_partition_id", - Action: "labeldrop", - }, - } default: - continue + for _, metricName := range jobSeries[job] { + if strings.Contains(string(metricName), gpuSeries[2]) { + // Just like DCGM exporter, AMD device metrics exporter + // exports GPU index as gpu_id and GPU partition ID as + // gpu_partition_id. We will relabel them to match the + // CEEMS exporter + relabelConfigs = []MetricRelabelConfig{ + { + SourceLabels: []string{"gpu_id"}, + TargetLabel: "index", + Regex: "(.*)", + Replacement: "$1", + Action: "replace", + }, + { + SourceLabels: []string{"gpu_partition_id"}, + TargetLabel: "gpuiid", + Regex: "(.*)", + Replacement: "$1", + Action: "replace", + }, + { + Regex: "gpu_id", + Action: "labeldrop", + }, + { + Regex: "gpu_partition_id", + Action: "labeldrop", + }, + } + } + } } scrapeConfigs = append(scrapeConfigs, ScrapeConfig{Job: job, RelabelConfigs: relabelConfigs}) @@ -189,10 +192,12 @@ func CreatePromRelabelConfig( // Encode to YAML with indent set to 2 var b bytes.Buffer + yamlEncoder := yaml.NewEncoder(&b) yamlEncoder.SetIndent(2) - if err := yamlEncoder.Encode(&PromConfig{ScrapeConfigs: scrapeConfigs}); err != nil { + err = yamlEncoder.Encode(&PromConfig{ScrapeConfigs: scrapeConfigs}) + if err != nil { fmt.Fprintln(os.Stderr, "error encoding scrape_configs", err) return err @@ -201,5 +206,11 @@ func CreatePromRelabelConfig( fmt.Fprintln(os.Stderr, "Merge the following scrape_configs with the current config.") fmt.Fprintln(os.Stderr, b.String()) + // Starting from v0.11.0, there is no need to set up these relabel configs as 
recording rules + // will take care of it. Inform that to users + fmt.Fprintln(os.Stderr, "WARNING:") + fmt.Fprintln(os.Stderr, "Starting from v0.11.0 the relabelling of metrics is handled by recording rules generated by ceems_tool") + fmt.Fprintln(os.Stderr, "There is no need to add the generated relabel_configs to Prometheus' scrape configs") + return nil } diff --git a/cmd/ceems_tool/rules.go b/cmd/ceems_tool/rules.go index 4b13a77c..091bc79c 100644 --- a/cmd/ceems_tool/rules.go +++ b/cmd/ceems_tool/rules.go @@ -31,6 +31,13 @@ const ( ipmiPowerMetric = "ceems_ipmi_dcmi_power_current_watts" redfishPowerMetric = "ceems_redfish_power_current_watts" crayPowerMetric = "ceems_cray_pm_counters_power_watts" + hwmonPowerMetric = "ceems_hwmon_power_current_watts" +) + +const ( + dcgmPowerMetric = "DCGM_FI_DEV_POWER_USAGE_INSTANT" + amdSMIPowerMetric = "amd_gpu_power" + amdDevExporterPkgPowerMetric = "gpu_package_power" ) var ( @@ -42,10 +49,11 @@ var ( ipmiPowerMetric, redfishPowerMetric, crayPowerMetric, + hwmonPowerMetric, "ceems_emissions_gCo2_kWh", - "DCGM_FI_DEV_POWER_USAGE_INSTANT", - "amd_gpu_power", - "gpu_power_usage", // AMD metrics device exporter + dcgmPowerMetric, + amdSMIPowerMetric, + amdDevExporterPkgPowerMetric, // AMD metrics device exporter "ceems_compute_unit_gpu_index_flag", "ceems_compute_unit_gpu_sm_count", } @@ -64,6 +72,19 @@ var ( "DCGM_FI_PROF_PCIE_TX_BYTES", "DCGM_FI_PROF_PCIE_RX_BYTES", } + + amdDevProfSeriesNames = []string{ + "gpu_prof_sm_active", + "gpu_prof_occupancy_elapsed", + "gpu_prof_occupancy_per_active_cu", + "gpu_prof_tensor_active_percent", + "gpu_prof_occupancy_percent", + "gpu_prof_total_16_ops", + "gpu_prof_total_32_ops", + "gpu_prof_total_64_ops", + "gpu_prof_write_size", + "gpu_prof_fetch_size", + } ) // Config represents Prometheus config. 
@@ -75,12 +96,11 @@ type Config struct { } type gpuTemplateData struct { - templateFile string - powerSeries model.LabelValue - powerScaler int64 - powerInHostPower bool - job model.LabelValue - nvProfSeries model.LabelValues + templateFile string + metricPrefix string + job model.LabelValue + nvProfSeries model.LabelValues + amdProfSeries model.LabelValues } type EmissionFactor struct { @@ -92,40 +112,24 @@ type EmissionFactor struct { type rulesTemplateData struct { GPU *gpuTemplateData TemplateFile string + HostPowerQuery string HostPowerSeries string RAPLAvailable bool Job model.LabelValue PUE float64 EmissionFactor EmissionFactor Providers model.LabelValues - Chassis model.LabelValue CountryCode string RateInterval string EvaluationInterval string } -func (t *rulesTemplateData) GPUPowerInHostPower() bool { - if t.GPU == nil { - return false - } - - return t.GPU.powerInHostPower -} - -func (t *rulesTemplateData) GPUPowerSeries() model.LabelValue { +func (t *rulesTemplateData) GPUMetricPrefix() string { if t.GPU == nil { return "" } - return t.GPU.powerSeries -} - -func (t *rulesTemplateData) GPUPowerScaler() int64 { - if t.GPU == nil { - return 1 - } - - return t.GPU.powerScaler + return t.GPU.metricPrefix } func (t *rulesTemplateData) GPUJob() model.LabelValue { @@ -144,6 +148,14 @@ func (t *rulesTemplateData) NVProfSeries() model.LabelValues { return t.GPU.nvProfSeries } +func (t *rulesTemplateData) AMDProfSeries() model.LabelValues { + if t.GPU == nil { + return nil + } + + return t.GPU.amdProfSeries +} + // CreatePromRecordingRules generates CEEMS specific recording rules for Prometheus. 
func CreatePromRecordingRules( ctx context.Context, @@ -205,8 +217,10 @@ func CreatePromRecordingRules( // static OWID data providers, err = efProviders(ctx, api, stime, etime, countryCode, disableProviders) if err != nil { - if owid, err := emissions.NewOWIDProvider(slog.New(slog.DiscardHandler)); err == nil { - if owidData, err := owid.Update(); err == nil { + owid, err := emissions.NewOWIDProvider(slog.New(slog.DiscardHandler)) + if err == nil { + owidData, err := owid.Update() + if err == nil { emissionFactor = EmissionFactor{Provider: "owid", Value: owidData[countryCode].Factor} fmt.Fprintln(os.Stderr, "static emission factor", emissionFactor.Value, "g/kWh from OWID data will be used") @@ -218,7 +232,9 @@ func CreatePromRecordingRules( } // Get necessary job meta data - activeJobs, jobSeries, gpuJobMap, err := jobSeriesMetaData(ctx, api, stime, etime, append(seriesNames, nvidiaProfSeriesNames...)) + series := append(seriesNames, append(nvidiaProfSeriesNames, amdDevProfSeriesNames...)...) 
+ + activeJobs, jobSeries, gpuJobMap, err := jobSeriesMetaData(ctx, api, stime, etime, series) if err != nil { fmt.Fprintln(os.Stderr, "error fetching series label values:", err) @@ -226,11 +242,15 @@ func CreatePromRecordingRules( } // Assert prof series into model.Values - var nvProfSeries model.LabelValues + var nvProfSeries, amdProfSeries model.LabelValues for _, s := range nvidiaProfSeriesNames { nvProfSeries = append(nvProfSeries, model.LabelValue(s)) } + for _, s := range amdDevProfSeriesNames { + amdProfSeries = append(amdProfSeries, model.LabelValue(s)) + } + // Create a new template and output director tmpl, err := newTemplate(outDir) if err != nil { @@ -251,11 +271,14 @@ func CreatePromRecordingRules( tmplFile = "cpu-cray.rules" hostPowerSeries = crayPowerMetric case slices.Contains(jobSeries[job], redfishPowerMetric): - tmplFile = "cpu-ipmi-redfish.rules" + tmplFile = "cpu-ipmi-redfish-hwmon.rules" hostPowerSeries = redfishPowerMetric case slices.Contains(jobSeries[job], ipmiPowerMetric): - tmplFile = "cpu-ipmi-redfish.rules" + tmplFile = "cpu-ipmi-redfish-hwmon.rules" hostPowerSeries = ipmiPowerMetric + case slices.Contains(jobSeries[job], hwmonPowerMetric): + tmplFile = "cpu-ipmi-redfish-hwmon.rules" + hostPowerSeries = hwmonPowerMetric case slices.Contains(jobSeries[job], "ceems_rapl_package_joules_total"): tmplFile = "cpu-rapl.rules" hostPowerSeries = "ceems_rapl_package_joules_total" @@ -266,75 +289,67 @@ func CreatePromRecordingRules( fmt.Fprintln(os.Stderr, "generating recording rules for job", job, "in file", job+".rules") // For redfish power usage counter, get all the possible chassis - var targetChassis model.LabelValue + var hostPowerLabelName, hostPowerLabel string - var hostPowerLabel string + switch hostPowerSeries { + case redfishPowerMetric: + hostPowerLabelName = "chassis" - if hostPowerSeries == redfishPowerMetric { - matcher := fmt.Sprintf(`%s{job="%s"}`, redfishPowerMetric, job) - - chassis, _, err := api.LabelValues(ctx, "chassis", 
[]string{matcher}, stime, etime) // Ignoring warnings for now. + targetChassis, err := findTargetLabel(ctx, api, redfishPowerMetric, hostPowerLabelName, job, stime, etime) if err != nil { - fmt.Fprintln(os.Stderr, "job:", job, "error fetching redfish chassis values:", err) + fmt.Fprintln(os.Stderr, "job:", job, "error fetching redfish target chassis values:", err) return err } - // If there are more than 1 chassis, emit log for operators to tell them to - // choose appropriate chassis to get CPU power usage - if len(chassis) > 1 { - fmt.Fprintln(os.Stderr, "Multiple chassis found for", redfishPowerMetric, "for job", job) - fmt.Fprintln(os.Stderr, "Choose the chassis that reports host power usage") - - for ichas, chas := range chassis { - msg := fmt.Sprintf("[%d]: %s", ichas, chas) - fmt.Fprintln(os.Stderr, msg) + // If targetChassis is found, set up label + if targetChassis != nil { + if len(targetChassis) > 1 { + hostPowerLabel = fmt.Sprintf(",%s=~\"%s\"", hostPowerLabelName, strings.Join(targetChassis, "|")) + } else { + hostPowerLabel = fmt.Sprintf(",%s=\"%s\"", hostPowerLabelName, targetChassis[0]) } + } + case hwmonPowerMetric: + hostPowerLabelName = "chip" - // Read input from user - var input string - - fmt.Fprintln(os.Stderr, "Enter number between 0 and", len(chassis)-1) - - if _, err = fmt.Scanln(&input); err != nil { - fmt.Fprintln(os.Stderr, "failed to scan user input:", err) - - return err - } + targetChips, err := findTargetLabel(ctx, api, hwmonPowerMetric, hostPowerLabelName, job, stime, etime) + if err != nil { + fmt.Fprintln(os.Stderr, "job:", job, "error fetching hwmon target chip values:", err) - // Convert user response to int - idx, err := strconv.Atoi(input) - if err != nil { - fmt.Fprintln(os.Stderr, "invalid user input:", err) + return err + } - return err + // If targetChassis is found, set up label + if targetChips != nil { + if len(targetChips) > 1 { + hostPowerLabel = fmt.Sprintf(",%s=~\"%s\"", hostPowerLabelName, 
strings.Join(targetChips, "|")) + } else { + hostPowerLabel = fmt.Sprintf(",%s=\"%s\"", hostPowerLabelName, targetChips[0]) } + } - // Check whether user input is valid - if idx >= len(chassis) { - fmt.Fprintln(os.Stderr, "user input out of range. Must be between 0 and", len(chassis)-1) + // Overwrite chip to sensor as one chip can have multiple sensors and we need to sum over all of them + hostPowerLabelName = "sensor" + } - return errors.New("user input out of range") - } + // Host power query + var hostPowerQuery string - targetChassis = chassis[idx] - } else if len(chassis) == 1 { - targetChassis = chassis[0] - } else { - fmt.Fprintln(os.Stderr, "no chassis found for", redfishPowerMetric, "for job", job) + if hostPowerLabel != "" { + hostPowerQuery = fmt.Sprintf(`sum without (%s) (%s{job="%s"%s})`, hostPowerLabelName, hostPowerSeries, job, hostPowerLabel) + } else { + hostPowerQuery = fmt.Sprintf(`%s{job="%s"%s}`, hostPowerSeries, job, hostPowerLabel) + } - return fmt.Errorf("no chassis found for %s", redfishPowerMetric) - } + var gpu *gpuTemplateData - // If targetChassis is found, set up label - if targetChassis != "" { - hostPowerLabel = fmt.Sprintf(",chassis=\"%s\"", targetChassis) - } + // Check if GPUs are present on the hosts and get GPU related template data if there + // is a GPU job corresponding to current job + if gpuJob, ok := gpuJobMap[job]; ok { + gpu, hostPowerQuery = gpuData(ctx, api, stime, etime, hostPowerQuery, job, gpuJob, nvProfSeries, amdProfSeries, jobSeries) } - // Check if GPUs are present on the hosts and get GPU related template data - gpu := gpuData(ctx, api, stime, etime, hostPowerSeries, hostPowerLabel, job, nvProfSeries, gpuJobMap, jobSeries) - // Use a rate interval that is atleast 4 times of scrape interval rateInterval := 4 * time.Duration(config.Global.ScrapeInterval) if val, ok := jobScrapeIntervals[string(job)]; ok { @@ -345,10 +360,10 @@ func CreatePromRecordingRules( tmplData := &rulesTemplateData{ GPU: gpu, TemplateFile: 
tmplFile, + HostPowerQuery: hostPowerQuery, HostPowerSeries: hostPowerSeries, RAPLAvailable: slices.Contains(jobSeries[job], "ceems_rapl_package_joules_total") && slices.Contains(jobSeries[job], "ceems_rapl_dram_joules_total"), Job: job, - Chassis: targetChassis, PUE: pueValue, EmissionFactor: emissionFactor, Providers: providers, @@ -358,7 +373,8 @@ func CreatePromRecordingRules( } // Render templates - if err := renderRules(tmpl, tmplData, outDir); err != nil { + err := renderRules(tmpl, tmplData, outDir) + if err != nil { fmt.Fprintln(os.Stderr, "job:", job, "error executing rules template:", err) continue @@ -413,8 +429,15 @@ func efProviders(ctx context.Context, api v1.API, start time.Time, end time.Time // jobSeriesMetaData returns necessary metadata related to Prom job's series. func jobSeriesMetaData(ctx context.Context, api v1.API, start time.Time, end time.Time, series []string) (model.LabelValues, map[model.LabelValue]model.LabelValues, map[model.LabelValue]model.LabelValue, error) { + // We might not have exact series names so make them regex matchable + seriesMatches := make([]string, len(series)) + + for is, s := range series { + seriesMatches[is] = fmt.Sprintf(`{__name__=~"(.*)%s(.*)"}`, s) + } + // Run query to get matching series. - foundSeries, _, err := api.Series(ctx, series, start, end) // Ignoring warnings for now. + foundSeries, _, err := api.Series(ctx, seriesMatches, start, end) // Ignoring warnings for now. 
if err != nil { return nil, nil, nil, err } @@ -442,6 +465,14 @@ func jobSeriesMetaData(ctx context.Context, api v1.API, start time.Time, end tim seriesJobs[s["__name__"]] = append(seriesJobs[s["__name__"]], s["job"]) } + // A special case for AMD device metrics exporter where metric label can have + // variable prefix + if strings.Contains(string(s["__name__"]), amdDevExporterPkgPowerMetric) { + if !slices.Contains(seriesJobs[amdDevExporterPkgPowerMetric], s["job"]) { + seriesJobs[amdDevExporterPkgPowerMetric] = append(seriesJobs[amdDevExporterPkgPowerMetric], s["job"]) + } + } + if !slices.Contains(activeJobs, s["job"]) { activeJobs = append(activeJobs, s["job"]) } @@ -455,7 +486,7 @@ func jobSeriesMetaData(ctx context.Context, api v1.API, start time.Time, end tim for _, cpuJob := range seriesJobs["ceems_compute_unit_gpu_index_flag"] { // Look for NVIDIA GPU associations - for _, gpuJob := range seriesJobs["DCGM_FI_DEV_POWER_USAGE_INSTANT"] { + for _, gpuJob := range seriesJobs[dcgmPowerMetric] { // If job instances between CEEMS job and GPU job matches, we mark it as an association if foundInstances := intersection(jobInstances[gpuJob], jobInstances[cpuJob]); len(foundInstances) > 0 { gpuJobsMap[cpuJob] = gpuJob @@ -463,7 +494,7 @@ func jobSeriesMetaData(ctx context.Context, api v1.API, start time.Time, end tim } // Look for AMD GPU associations with AMD SMI exporter - for _, gpuJob := range seriesJobs["amd_gpu_power"] { + for _, gpuJob := range seriesJobs[amdSMIPowerMetric] { // If job instances between CEEMS job and GPU job matches, we mark it as an association if foundInstances := intersection(jobInstances[gpuJob], jobInstances[cpuJob]); len(foundInstances) > 0 { gpuJobsMap[cpuJob] = gpuJob @@ -471,7 +502,7 @@ func jobSeriesMetaData(ctx context.Context, api v1.API, start time.Time, end tim } // Look for AMD GPU associations with AMD device metrics exporter - for _, gpuJob := range seriesJobs["gpu_power_usage"] { + for _, gpuJob := range 
seriesJobs[amdDevExporterPkgPowerMetric] { // If job instances between CEEMS job and GPU job matches, we mark it as an association if foundInstances := intersection(jobInstances[gpuJob], jobInstances[cpuJob]); len(foundInstances) > 0 { gpuJobsMap[cpuJob] = gpuJob @@ -491,6 +522,9 @@ func newTemplate(outDir string) (*template.Template, error) { "Split": func(s, sep string) []string { return strings.Split(s, sep) }, + "IsSubString": func(s, sub string) bool { + return strings.Contains(s, sub) + }, "Contains": func(s model.LabelValues, e string) bool { return slices.Contains(s, model.LabelValue(e)) }, @@ -506,7 +540,8 @@ func newTemplate(outDir string) (*template.Template, error) { } // Make directory to store recording rules files - if err := os.MkdirAll(outDir, 0o700); err != nil { + err = os.MkdirAll(outDir, 0o700) + if err != nil { fmt.Fprintln(os.Stderr, "error creating output directory:", err) return nil, err @@ -515,86 +550,202 @@ func newTemplate(outDir string) (*template.Template, error) { return tmpl, nil } +// findTargetLabel returns the target label when multiple labels found on metric. +func findTargetLabel(ctx context.Context, api v1.API, metricName string, labelName string, job model.LabelValue, stime time.Time, etime time.Time) ([]string, error) { + matcher := fmt.Sprintf(`%s{job="%s"}`, metricName, job) + + labels, _, err := api.LabelValues(ctx, labelName, []string{matcher}, stime, etime) // Ignoring warnings for now. 
+ if err != nil { + fmt.Fprintln(os.Stderr, "job:", job, "error fetching", labelName, "values:", err) + + return nil, err + } + + var targetLabels []string + + // If there are more than 1 chassis, emit log for operators to tell them to + // choose appropriate chassis to get CPU power usage + if len(labels) > 1 { + fmt.Fprintln(os.Stderr, "Multiple", labelName, "found for", metricName, "for job", job) + fmt.Fprintln(os.Stderr, "Choose the", labelName, "that reports host power usage") + + for ichas, chas := range labels { + msg := fmt.Sprintf("[%d]: %s", ichas, chas) + fmt.Fprintln(os.Stderr, msg) + } + + // Read input from user + var inputs string + + fmt.Fprintln(os.Stderr, "Enter number(s) between 0 and", len(labels)-1) + fmt.Fprintln(os.Stderr, "Multiple labels can be selected by using comma separated list of numbers, e.g., 0,1") + + _, err = fmt.Scanln(&inputs) + if err != nil { + fmt.Fprintln(os.Stderr, "failed to scan user input:", err) + + return nil, err + } + + for input := range strings.SplitSeq(inputs, ",") { + // Convert user response to int + idx, err := strconv.Atoi(input) + if err != nil { + fmt.Fprintln(os.Stderr, "invalid user input:", err) + + return nil, err + } + + // Check whether user input is valid + if idx >= len(labels) { + fmt.Fprintln(os.Stderr, "user input out of range. Must be between 0 and", len(labels)-1) + + return nil, errors.New("user input out of range") + } + + targetLabels = append(targetLabels, string(labels[idx])) + } + } else if len(labels) == 1 { + targetLabels = []string{string(labels[0])} + } else { + fmt.Fprintln(os.Stderr, "no", labelName, "found for", metricName, "for job", job) + + return nil, fmt.Errorf("no %s found for %s", labelName, redfishPowerMetric) + } + + return targetLabels, nil +} + // gpuData returns the template related data for GPUs. 
func gpuData( ctx context.Context, api v1.API, stime time.Time, etime time.Time, - hostPowerSeries string, - hostPowerLabel string, + hostPowerQuery string, job model.LabelValue, + gpuJob model.LabelValue, nvProfSeries model.LabelValues, - gpuJobMap map[model.LabelValue]model.LabelValue, + amdProfSeries model.LabelValues, jobSeries map[model.LabelValue]model.LabelValues, -) *gpuTemplateData { - // If there is no GPUs on the instances of current job, return - if _, ok := gpuJobMap[job]; !ok { - return nil - } +) (*gpuTemplateData, string) { + var hostPowerOnlyQuery string - // Get GPU job name associated with current job + // Instantiate GPU template data gpu := &gpuTemplateData{ - job: gpuJobMap[job], + job: gpuJob, } // Based on GPU type get Get GPU power series name and template file name + // + // Get labels of unique devices on each node. In case of NVIDIA GPU partitions, power consumption + // metric will be duplicated for each partition and so we should only take into account + // the power consumption of physical devices. + // + // However, in the case of AMD GPUs, power usage is reported only for first partition + // and rest of partitions will have power usage reported as zero. + // + // Also, we noticed that in the case of AMD GPUs, device metrics exporter when deployed + // using GPU operator, it reported gpu_power_usage as zero for all GPUs where as + // gpu_package_power reported correct GPU power usage. However, when device metrics + // exporter is installed via system package manager such as apt, gpu_power_usage + // was reporting correct power usage. + // AMD device exporter allows to add a prefix to metric names and we should figure + // out that prefix as well for the recording rules. 
switch { - case slices.Contains(jobSeries[gpu.job], "DCGM_FI_DEV_POWER_USAGE_INSTANT"): - gpu.powerSeries = "DCGM_FI_DEV_POWER_USAGE_INSTANT" - gpu.powerScaler = 1 + case slices.Contains(jobSeries[gpu.job], amdSMIPowerMetric): + gpu.templateFile = "gpu-amd-smi.rules" + + // Host power query assuming GPU power is in host power + // We dont know if AMD SMI exporter duplicates power consumption for all partitions or reports + // usage only for first partiition and rest as zero like in AMD device metrics exporter. We assume + // the behaviour is same as the AMD device exporter for the moment. + hostPowerOnlyQuery = fmt.Sprintf( + `(%s - on (hostname) group_left () sum by (hostname) (label_replace(sum by (hostname) (%s{job="%s"}) / 1e6, "hostname", "$1", "instance","([^:]+):\\d+")))`, + hostPowerQuery, amdSMIPowerMetric, gpu.job, + ) + case slices.Contains(jobSeries[gpu.job], dcgmPowerMetric): gpu.templateFile = "gpu-nvidia.rules" + // Host power query assuming GPU power is in host power + hostPowerOnlyQuery = fmt.Sprintf( + `(%s - on (hostname) group_left () sum by (hostname) (avg by (hostname,device) (label_replace(%s{job="%s"}, "hostname", "$1", "Hostname","(.*)"))))`, + hostPowerQuery, dcgmPowerMetric, gpu.job, + ) + // For NVIDIA GPUs check if prof metrics are available gpu.nvProfSeries = intersection(jobSeries[gpu.job], nvProfSeries) - case slices.Contains(jobSeries[gpu.job], "gpu_power_usage"): - gpu.powerSeries = "gpu_power_usage" - gpu.powerScaler = 1 - gpu.templateFile = "gpu-amd-device-metrics.rules" default: - gpu.powerSeries = "amd_gpu_power" - gpu.powerScaler = 1e6 - gpu.templateFile = "gpu-amd-smi.rules" + gpu.templateFile = "gpu-amd-device-metrics.rules" + + // Default case is that we are using AMD device metrics exporter. 
In this case, first we need + // to figure out metric prefix if there is any + for _, metric := range jobSeries[gpu.job] { + if strings.Contains(string(metric), amdDevExporterPkgPowerMetric) { + if p := strings.Split(string(metric), amdDevExporterPkgPowerMetric); len(p) == 2 { + gpu.metricPrefix = p[0] + + break + } + } + } + + // Host power query assuming GPU power is in host power + hostPowerOnlyQuery = fmt.Sprintf( + `(%s - on (hostname) group_left () sum by (hostname) (sum by (hostname,serial_number) (%s%s{job="%s"})))`, + hostPowerQuery, gpu.metricPrefix, amdDevExporterPkgPowerMetric, gpu.job, + ) + + // Prof series names with prefix + var amdProfSeriesPrefix model.LabelValues + for _, n := range amdProfSeries { + amdProfSeriesPrefix = append(amdProfSeriesPrefix, model.LabelValue(gpu.metricPrefix+string(n))) + } + + // For AMD GPUs check if prof metrics are available + gpu.amdProfSeries = intersection(jobSeries[gpu.job], amdProfSeriesPrefix) } // If host power series is cray, we dont need to check if GPU power is in host power // Cray exposes all components separately - if hostPowerSeries == "ceems_cray_pm_counters_power_watts" { - return gpu + if strings.Contains(hostPowerQuery, crayPowerMetric) { + return gpu, hostPowerQuery } // Check if host power includes GPU power or not - query := fmt.Sprintf( - `avg_over_time((label_replace(%s{job="%s"%s}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(%s{job="%s"} / %d, "instancehost", "$1", "instance","([^:]+):\\d+")))[%s:])`, - hostPowerSeries, job, hostPowerLabel, gpu.powerSeries, gpu.job, gpu.powerScaler, etime.Sub(stime).Truncate(time.Minute).String(), - ) + query := fmt.Sprintf(`avg_over_time(%s[%s:])`, hostPowerOnlyQuery, etime.Sub(stime).Truncate(time.Minute).String()) // Make query against Prometheus - if result, _, err := api.Query(ctx, query, etime); err == nil { + result, _, err := api.Query(ctx, query, etime) + if err == nil { // If 
average value is more than 0, that means Host power includes GPU power if val, ok := result.(model.Vector); ok && len(val) > 0 { if val[0].Value > 0 { - gpu.powerInHostPower = true + return gpu, hostPowerOnlyQuery } } } else { - fmt.Fprintln(os.Stderr, "failed to verify if host power reported by", hostPowerSeries, "for job", job, "includes GPU power. Please make manual check and modify rule appropriately. Error is:", err) + fmt.Fprintln(os.Stderr, "failed to verify if host power reported by", hostPowerQuery, "for job", job, "includes GPU power. Please make manual check and modify rule appropriately. Error is:", err) } - return gpu + return gpu, hostPowerQuery } // renderRules generates recording rules by rendering template files. func renderRules(tmpl *template.Template, tmplData *rulesTemplateData, outDir string) error { // Render the CPU rules template buf := &bytes.Buffer{} - if err := tmpl.ExecuteTemplate(buf, tmplData.TemplateFile, tmplData); err != nil { + + err := tmpl.ExecuteTemplate(buf, tmplData.TemplateFile, tmplData) + if err != nil { return err } // Write to CPU recording rules to file path := filepath.Join(outDir, fmt.Sprintf("%s.rules", tmplData.Job)) - if err := os.WriteFile(path, buf.Bytes(), 0o600); err != nil { + + err = os.WriteFile(path, buf.Bytes(), 0o600) + if err != nil { return err } @@ -603,13 +754,17 @@ func renderRules(tmpl *template.Template, tmplData *rulesTemplateData, outDir st fmt.Fprintln(os.Stderr, "generating recording rules for GPU for job", tmplData.GPU.job, "in file", tmplData.GPU.job+"-gpu.rules") buf := &bytes.Buffer{} - if err := tmpl.ExecuteTemplate(buf, tmplData.GPU.templateFile, tmplData); err != nil { + + err := tmpl.ExecuteTemplate(buf, tmplData.GPU.templateFile, tmplData) + if err != nil { return err } // Write to CPU recording rules to file path := filepath.Join(outDir, fmt.Sprintf("%s-gpu.rules", tmplData.GPU.job)) - if err := os.WriteFile(path, buf.Bytes(), 0o600); err != nil { + + err = os.WriteFile(path, 
buf.Bytes(), 0o600) + if err != nil { return err } } diff --git a/cmd/ceems_tool/rules/cpu-ipmi-redfish.rules b/cmd/ceems_tool/rules/cpu-ipmi-redfish-hwmon.rules similarity index 94% rename from cmd/ceems_tool/rules/cpu-ipmi-redfish.rules rename to cmd/ceems_tool/rules/cpu-ipmi-redfish-hwmon.rules index d489650a..3688fe7d 100644 --- a/cmd/ceems_tool/rules/cpu-ipmi-redfish.rules +++ b/cmd/ceems_tool/rules/cpu-ipmi-redfish-hwmon.rules @@ -33,18 +33,8 @@ groups: (ceems_compute_unit_memory_total_bytes{job="{{.Job}}"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. -{{- $chassislabel := "" -}} -{{- if ne $.Chassis "" }} -{{- $chassislabel = printf ",chassis=\"%s\"" $.Chassis }} -{{- end }} -{{- $query := "" -}} -{{- if .GPUPowerInHostPower }} -{{- $query = printf "(label_replace(%s{job=\"%s\"%s}, \"instancehost\", \"$1\", \"instance\", \"([^:]+):\\\\d+\") - on (instancehost) group_left () sum by (instancehost) (label_replace(%s{job=\"%s\"} / %d, \"instancehost\", \"$1\", \"instance\", \"([^:]+):\\\\d+\"))) > 0" $.HostPowerSeries $.Job $chassislabel $.GPUPowerSeries $.GPUJob $.GPUPowerScaler }} -{{- else }} -{{- $query = printf "%s{job=\"%s\"%s}" $.HostPowerSeries $.Job $chassislabel }} -{{- end }} - record: instance:{{.HostPowerSeries}}:pue - expr: {{.PUE}} * {{$query}} + expr: {{.PUE}} * {{.HostPowerQuery}} # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -214,7 +204,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) ({{.PUE}} * {{$query}}) + sum by (job) ({{.PUE}} * {{.HostPowerQuery}}) {{- $numproviders := len .Providers -}} {{- if gt $numproviders 0 }} diff --git a/cmd/ceems_tool/rules/gpu-amd-device-metrics.rules b/cmd/ceems_tool/rules/gpu-amd-device-metrics.rules index 39d075bd..29421cd5 100644 --- a/cmd/ceems_tool/rules/gpu-amd-device-metrics.rules +++ b/cmd/ceems_tool/rules/gpu-amd-device-metrics.rules @@ -11,6 +11,7 @@ # We leverage these rules to include PUE (Power Usage Effectiveness) in the Power # estimation as well. # +{{- $labelreplace := "label_replace(label_replace(%s%s{job=\"%s\"}, \"gpuuuid\", \"$1\", \"serial_number\", \"(.*)\"), \"gpuiid\", \"$1\", \"gpu_partition_id\", \"(.*)\")" }} groups: - name: compute-unit-gpu-rules-{{.GPUJob}} interval: {{.EvaluationInterval}} @@ -18,35 +19,64 @@ groups: # GPU Usage (%) by compute unit - record: uuid:ceems_gpu_usage:ratio expr: |2 - gpu_gfx_activity{job="{{.GPUJob}}"} - * on (index) group_right () + {{ printf $labelreplace .GPUMetricPrefix "gpu_gfx_activity" .GPUJob }} + * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} # GPU Memory Usage (%) by compute unit - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: vram expr: |2 ( - gpu_used_vram{job="{{.GPUJob}}"} * 100 + {{ printf $labelreplace .GPUMetricPrefix "gpu_used_vram" .GPUJob }} * 100 / - gpu_total_vram{job="{{.GPUJob}}"} + {{ printf $labelreplace .GPUMetricPrefix "gpu_total_vram" .GPUJob }} > 0 ) - * on (index) group_right () + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} + + - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: gtt + expr: |2 + ( + {{ printf $labelreplace .GPUMetricPrefix "gpu_used_gtt" .GPUJob }} * 100 + / + {{ printf $labelreplace .GPUMetricPrefix "gpu_total_gtt" .GPUJob }} > 0 + ) + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} + + - 
record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: visiblevram + expr: |2 + ( + {{ printf $labelreplace .GPUMetricPrefix "gpu_used_visible_vram" .GPUJob }} * 100 + / + {{ printf $labelreplace .GPUMetricPrefix "gpu_total_visible_vram" .GPUJob }} > 0 + ) + * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. + # For AMD device metrics exporter, gpu_partition 0 is the one that has non zero power usage and + # rest of partitions will report zero. So, we need to always consider partition 0 to get real usage + # of GPU. In the relabel config we change - record: dev:gpu_power_usage_watts:pue - expr: {{.PUE}} * gpu_power_usage{job="{{.GPUJob}}"} + expr: {{.PUE}} * {{ printf $labelreplace .GPUMetricPrefix "gpu_package_power" .GPUJob }} - record: uuid:ceems_gpu_power_watts:pue expr: |2 ( ceems_compute_unit_gpu_sm_count{job="{{.Job}}"} - / on (index) group_left () - (sum by (index) (ceems_compute_unit_gpu_sm_count{job="{{.Job}}"}) > 0) + / on (gpuuuid) group_left () + (sum by (gpuuuid) (ceems_compute_unit_gpu_sm_count{job="{{.Job}}"}) > 0) ) - * on (index) group_right() - dev:gpu_power_usage_watts:pue{job="{{.GPUJob}}"} - * on (index) group_right() + * on (gpuuuid) group_left() + dev:gpu_power_usage_watts:pue{job="{{.GPUJob}}",gpuiid="0"} + * on (gpuuuid,gpuiid,uuid) group_right() ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} {{- $numproviders := len .Providers -}} @@ -58,22 +88,14 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:gpu_power_usage_watts:pue{job="{{$.GPUJob}}"} / 3.6e+06 - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job="{{$.Job}}"}, + uuid:ceems_gpu_power_watts:pue{job="{{$.Job}}"} / 3.6e+06, "provider", "{{$v}}", "instance", "(.*)" ) * on (provider) group_left () - label_replace( - ceems_emissions_gCo2_kWh{country_code="{{$.CountryCode}}",provider="{{$v}}"}, - 
"common_label", - "mock", - "instance", - "(.*)" - ) + ceems_emissions_gCo2_kWh{country_code="{{$.CountryCode}}",provider="{{$v}}"} {{ end }} {{- else }} {{- if gt $.EmissionFactor.Value 0.0 }} @@ -84,9 +106,7 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:gpu_power_usage_watts:pue{job="{{$.GPUJob}}"} / 3.6e+06 - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job="{{$.Job}}"}, + uuid:ceems_gpu_power_watts:pue{job="{{$.GPUJob}}"} / 3.6e+06, "provider", "{{$.EmissionFactor.Provider}}", "instance", @@ -96,6 +116,25 @@ groups: {{- end }} {{- end }} + # Profiling metrics +{{ range $i, $v := .AMDProfSeries }} +{{- $stringname := printf "%s" $v -}} +{{- $metricprefix := printf "%sgpu_" $.GPUMetricPrefix -}} +{{- $splitname := Split $stringname $metricprefix -}} +{{- $suffix := "sum" }} +{{- if or (IsSubString $stringname "active") (IsSubString $stringname "percent") }} +{{- $suffix = "ratio" }} +{{- end }} +{{- $name := index $splitname 1 -}} +{{- $namelower := $name | ToLower }} +{{- $label := printf "uuid:ceems_gpu_%s:%s" $namelower $suffix }} + - record: {{ $label }} + expr: |2 + {{ printf $labelreplace "" $v $.GPUJob }} + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="{{$.Job}}"} +{{ end }} + # The following recording rules estimate the average GPU, GPU memory usages and # total GPU power and its equivalent emissions aggregared for all hosts # per Prometheus job. 
@@ -105,23 +144,27 @@ groups: rules: # Average GPU Usage (%) for all hosts aggregated per Prometheus job - record: job:ceems_gpu_usage:avg - expr: avg by (job) (gpu_gfx_activity{job="{{.GPUJob}}"}) + expr: avg by (job) ({{.GPUMetricPrefix}}gpu_gfx_activity{job="{{.GPUJob}}"}) # Average GPU memory usage (%) for all hosts aggregated per Prometheus job - record: job:ceems_gpu_memory_usage:avg_ratio - expr: |2 - avg by (job) ( - ( - gpu_used_vram{job="{{.GPUJob}}"} * 100 - / - gpu_total_vram{job="{{.GPUJob}}"} - ) - ) + labels: + type: vram + expr: avg by (job) (({{.GPUMetricPrefix}}gpu_used_vram{job="{{.GPUJob}}"} * 100 / {{.GPUMetricPrefix}}gpu_total_vram{job="{{.GPUJob}}"} > 0)) + + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: gtt + expr: avg by (job) (({{.GPUMetricPrefix}}gpu_used_gtt{job="{{.GPUJob}}"} * 100 / {{.GPUMetricPrefix}}gpu_total_gtt{job="{{.GPUJob}}"} > 0)) + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: visiblevram + expr: avg by (job) (({{.GPUMetricPrefix}}gpu_used_visible_ram{job="{{.GPUJob}}"} * 100 / {{.GPUMetricPrefix}}gpu_total_visible_ram{job="{{.GPUJob}}"} > 0)) + # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job - # AMD GPU power is in micro Watts and we need to convert it to Watts here - record: job:ceems_gpu_power_watts:pue - expr: sum by (job)({{.PUE}} * gpu_power_usage{job="{{.GPUJob}}"} / 1e6) + expr: sum by (job)({{.PUE}} * {{.GPUMetricPrefix}}gpu_package_power{job="{{.GPUJob}}"}) {{- $numproviders := len .Providers -}} {{- if gt $numproviders 0 }} diff --git a/cmd/ceems_tool/rules/gpu-amd-smi.rules b/cmd/ceems_tool/rules/gpu-amd-smi.rules index 45a93c8d..d7ef9039 100644 --- a/cmd/ceems_tool/rules/gpu-amd-smi.rules +++ b/cmd/ceems_tool/rules/gpu-amd-smi.rules @@ -11,6 +11,7 @@ # We leverage these rules to include PUE (Power Usage Effectiveness) in the Power # estimation as well. 
# +{{- $labelreplace := "label_replace(%s{job=\"%s\"}, \"index\", \"$1\", \"%s\", \"(.*)\")" }} groups: - name: compute-unit-gpu-rules-{{.GPUJob}} interval: {{.EvaluationInterval}} @@ -18,21 +19,21 @@ groups: # GPU Usage (%) by compute unit - record: uuid:ceems_gpu_usage:ratio expr: |2 - amd_gpu_use_percent{job="{{.GPUJob}}"} + {{ printf $labelreplace "amd_gpu_use_percent" .GPUJob "gpu_use_percent" }} * on (index) group_right () ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} # GPU Memory Usage (%) by compute unit - record: uuid:ceems_gpu_memory_usage:ratio expr: |2 - amd_gpu_memory_use_percent{job="{{.GPUJob}}"} + {{ printf $labelreplace "amd_gpu_memory_use_percent" .GPUJob "gpu_memory_use_percent" }} * on (index) group_right () ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. # AMD GPU power is in micro Watts and we need to convert it to Watts here - record: dev:amd_gpu_power_watts:pue - expr: {{.PUE}} * amd_gpu_power{job="{{.GPUJob}}"} / 1e6 + expr: {{.PUE}} * {{ printf $labelreplace "amd_gpu_power" .GPUJob "gpu_power" }} / 1e6 - record: uuid:ceems_gpu_power_watts:pue expr: |2 @@ -41,7 +42,7 @@ groups: / on (index) group_left () (sum by (index) (ceems_compute_unit_gpu_sm_count{job="{{.Job}}"}) > 0) ) - * on (index) group_right() + * on (index) group_left() dev:amd_gpu_power_watts:pue{job="{{.GPUJob}}"} * on (index) group_right() ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} @@ -55,22 +56,14 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:amd_gpu_power_watts:pue{job="{{$.GPUJob}}"} / 3.6e+06 - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job="{{$.Job}}"}, + uuid:ceems_gpu_power_watts:pue{job="{{$.Job}}"} / 3.6e+06, "provider", "{{$v}}", "instance", "(.*)" ) * on (provider) group_left () - label_replace( - ceems_emissions_gCo2_kWh{country_code="{{$.CountryCode}}",provider="{{$v}}"}, - "common_label", - "mock", 
- "instance", - "(.*)" - ) + ceems_emissions_gCo2_kWh{country_code="{{$.CountryCode}}",provider="{{$v}}"} {{ end }} {{- else }} {{- if gt $.EmissionFactor.Value 0.0 }} @@ -81,9 +74,7 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:amd_gpu_power_watts:pue{job="{{$.GPUJob}}"} / 3.6e+06 - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job="{{$.Job}}"}, + uuid:ceems_gpu_power_watts:pue{job="{{$.GPUJob}}"} / 3.6e+06, "provider", "{{$.EmissionFactor.Provider}}", "instance", diff --git a/cmd/ceems_tool/rules/gpu-nvidia.rules b/cmd/ceems_tool/rules/gpu-nvidia.rules index b3e3248c..3ae085d6 100644 --- a/cmd/ceems_tool/rules/gpu-nvidia.rules +++ b/cmd/ceems_tool/rules/gpu-nvidia.rules @@ -7,6 +7,7 @@ # We leverage these rules to include PUE (Power Usage Effectiveness) in the Power # estimation as well. # +{{- $labelreplace := "label_replace(label_replace(%s{job=\"%s\"}, \"gpuuuid\", \"$1\", \"UUID\", \"(.*)\"), \"gpuiid\", \"$1\", \"GPU_I_ID\", \"(.*)\")" }} groups: - name: compute-unit-gpu-rules-{{.GPUJob}} interval: {{.EvaluationInterval}} @@ -17,12 +18,14 @@ groups: # Ref: https://github.com/NVIDIA/DCGM/issues/64#issuecomment-1400811885 # Ref: https://github.com/NVIDIA/DCGM/issues/80#issuecomment-1537603016 {{- $gpuusagemetric := "DCGM_FI_DEV_GPU_UTIL" -}} +{{- $usagefactor := 1 }} {{- if Contains .NVProfSeries "DCGM_FI_PROF_GR_ENGINE_ACTIVE" }} {{- $gpuusagemetric = "DCGM_FI_PROF_GR_ENGINE_ACTIVE" -}} +{{- $usagefactor = 100 }} {{- end }} - record: uuid:ceems_gpu_usage:ratio expr: |2 - {{$gpuusagemetric}}{job="{{.GPUJob}}"} + {{ printf $labelreplace $gpuusagemetric .GPUJob }} * {{$usagefactor}} * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} @@ -30,16 +33,16 @@ groups: - record: uuid:ceems_gpu_memory_usage:ratio expr: |2 ( - DCGM_FI_DEV_FB_USED{job="{{.GPUJob}}"} * 100 + {{ printf $labelreplace "DCGM_FI_DEV_FB_USED" .GPUJob }} * 100 / - (DCGM_FI_DEV_FB_USED{job="{{.GPUJob}}"} + 
DCGM_FI_DEV_FB_FREE{job="{{.GPUJob}}"}) + ({{ printf $labelreplace "DCGM_FI_DEV_FB_USED" .GPUJob }} + {{ printf $labelreplace "DCGM_FI_DEV_FB_FREE" .GPUJob }}) ) * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. - record: dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue - expr: {{.PUE}} * DCGM_FI_DEV_POWER_USAGE_INSTANT{job="{{.GPUJob}}"} + expr: {{.PUE}} * {{ printf $labelreplace "DCGM_FI_DEV_POWER_USAGE_INSTANT" .GPUJob }} # When profiling metrics are available, we split the total power consumed by physical # GPU among all MIG instances based on "effective" SM usage on each MIG instance. This @@ -53,10 +56,10 @@ groups: ( ( ( - DCGM_FI_PROF_SM_ACTIVE{job="{{.GPUJob}}"} + {{ printf $labelreplace "DCGM_FI_PROF_SM_ACTIVE" .GPUJob }} * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="{{.GPUJob}}"} - * on (gpuuuid, gpuiid) + {{ printf $labelreplace "DCGM_FI_PROF_SM_OCCUPANCY" .GPUJob }} + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="{{.Job}}"} ) ) @@ -64,10 +67,10 @@ groups: ( sum by (gpuuuid) ( ( - DCGM_FI_PROF_SM_ACTIVE{job="{{.GPUJob}}"} + {{ printf $labelreplace "DCGM_FI_PROF_SM_ACTIVE" .GPUJob }} * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="{{.GPUJob}}"} - * on (gpuuuid, gpuiid) + {{ printf $labelreplace "DCGM_FI_PROF_SM_OCCUPANCY" .GPUJob }} + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="{{.Job}}"} ) > @@ -75,9 +78,9 @@ groups: ) ) ) - * on (gpuuuid) group_right () + * on (gpuuuid,gpuiid) group_left () dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="{{.GPUJob}}"} - * on (gpuuuid, gpuiid) group_right () + * on (gpuuuid,gpuiid,uuid) group_right () ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} {{- else }} ( @@ -85,9 +88,9 @@ groups: / on (gpuuuid) group_left () (sum by (gpuuuid) (ceems_compute_unit_gpu_sm_count{job="{{.Job}}"}) > 0) ) - * on (gpuuuid) group_right () + 
* on (gpuuuid,gpuiid) group_left () dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="{{.GPUJob}}"} - * on (gpuuuid, gpuiid) group_right () + * on (gpuuuid,gpuiid,uuid) group_right () ceems_compute_unit_gpu_index_flag{job="{{.Job}}"} {{- end }} @@ -100,22 +103,14 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="{{$.GPUJob}}"} / 3.6e+06 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job="{{$.Job}}"}, + uuid:ceems_gpu_power_watts:pue{job="{{$.Job}}"} / 3.6e+06, "provider", "{{$v}}", "instance", "(.*)" ) * on (provider) group_left () - label_replace( - ceems_emissions_gCo2_kWh{country_code="{{$.CountryCode}}",provider="{{$v}}"}, - "common_label", - "mock", - "instance", - "(.*)" - ) + ceems_emissions_gCo2_kWh{country_code="{{$.CountryCode}}",provider="{{$v}}"} {{ end }} {{- else }} {{- if gt $.EmissionFactor.Value 0.0 }} @@ -126,9 +121,7 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="{{$.GPUJob}}"} / 3.6e+06 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job="{{$.Job}}"}, + uuid:ceems_gpu_power_watts:pue{job="{{$.GPUJob}}"} / 3.6e+06, "provider", "{{$.EmissionFactor.Provider}}", "instance", @@ -141,13 +134,19 @@ groups: # Profiling metrics {{ range $i, $v := .NVProfSeries }} {{- $stringname := printf "%s" $v -}} -{{- $splitname := Split $stringname "DCGM_FI_PROF_" -}} +{{- $splitname := Split $stringname "DCGM_FI_" -}} +{{- $suffix := "ratio" }} +{{- $factor := 100 }} +{{- if IsSubString $stringname "BYTES" }} +{{- $suffix = "rate" }} +{{- $factor = 1 }} +{{- end }} {{- $name := index $splitname 1 -}} {{- $namelower := $name | ToLower }} -{{- $label := printf "uuid:ceems_gpu_%s:ratio" $namelower }} +{{- $label := printf "uuid:ceems_gpu_%s:%s" $namelower $suffix }} - record: {{ $label }} expr: |2 - {{$v}}{job="{{$.GPUJob}}"} * 100 + {{ printf $labelreplace $v $.GPUJob 
}} * {{$factor}} * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="{{$.Job}}"} {{ end }} diff --git a/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt b/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt index 98b3af1b..8e7a56ec 100644 --- a/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt +++ b/cmd/ceems_tool/testdata/output/e2e-test-recording-rules-output.txt @@ -1,3 +1,5 @@ +Number of series found for job series are: 77 +Number of series found for uuid series are: 714 amd-device-metrics-gpu-gpu.rules --- # Recording rules for AMD GPUs scrape job amd-device-metrics-gpu. @@ -19,24 +21,212 @@ groups: # GPU Usage (%) by compute unit - record: uuid:ceems_gpu_usage:ratio expr: |2 - gpu_gfx_activity{job="amd-device-metrics-gpu"} - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"} + label_replace(label_replace(amd_gpu_gfx_activity{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} # GPU Memory Usage (%) by compute unit - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: vram + expr: |2 + ( + label_replace(label_replace(amd_gpu_used_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") * 100 + / + label_replace(label_replace(amd_gpu_total_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: gtt + expr: |2 + ( + label_replace(label_replace(amd_gpu_used_gtt{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") * 100 
+ / + label_replace(label_replace(amd_gpu_total_gtt{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: visiblevram expr: |2 ( - gpu_used_vram{job="amd-device-metrics-gpu"} * 100 + label_replace(label_replace(amd_gpu_used_visible_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") * 100 / - gpu_total_vram{job="amd-device-metrics-gpu"} + label_replace(label_replace(amd_gpu_total_visible_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. + # For AMD device metrics exporter, gpu_partition 0 is the one that has non zero power usage and + # rest of partitions will report zero. So, we need to always consider partition 0 to get real usage + # of GPU. 
In the relabel config we change + - record: dev:gpu_power_usage_watts:pue + expr: 1 * label_replace(label_replace(amd_gpu_package_power{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + + - record: uuid:ceems_gpu_power_watts:pue + expr: |2 + ( + ceems_compute_unit_gpu_sm_count{job="cpu-hwmon-amd-gpu"} + / on (gpuuuid) group_left () + (sum by (gpuuuid) (ceems_compute_unit_gpu_sm_count{job="cpu-hwmon-amd-gpu"}) > 0) + ) + * on (gpuuuid) group_left() + dev:gpu_power_usage_watts:pue{job="amd-device-metrics-gpu",gpuiid="0"} + * on (gpuuuid,gpuiid,uuid) group_right() + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + # Total equivalent emissions rate (g/s) from GPU due to the power consumed by the compute unit's GPUs. + # The equivalent emissions are estimated using emission factor from owid for country + # FR + - record: uuid:ceems_gpu_emissions_g_s:pue + expr: |2 + label_replace( + uuid:ceems_gpu_power_watts:pue{job="cpu-hwmon-amd-gpu"} / 3.6e+06, + "provider", + "owid", + "instance", + "(.*)" + ) + * on (provider) group_left () + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"} + + + # Profiling metrics + + - record: uuid:ceems_gpu_prof_sm_active:ratio + expr: |2 + label_replace(label_replace(amd_gpu_prof_sm_active{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_tensor_active_percent:ratio + expr: |2 + label_replace(label_replace(amd_gpu_prof_tensor_active_percent{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_occupancy_percent:ratio + expr: |2 + 
label_replace(label_replace(amd_gpu_prof_occupancy_percent{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_total_16_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_16_ops{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_total_32_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_32_ops{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_total_64_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_64_ops{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_write_size:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_write_size{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_fetch_size:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_fetch_size{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + + # The following recording rules estimate the 
average GPU, GPU memory usages and + # total GPU power and its equivalent emissions aggregared for all hosts + # per Prometheus job. + # + - name: host-agg-gpu-rules-amd-device-metrics-gpu + interval: 2s + rules: + # Average GPU Usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_usage:avg + expr: avg by (job) (amd_gpu_gfx_activity{job="amd-device-metrics-gpu"}) + + # Average GPU memory usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: vram + expr: avg by (job) ((amd_gpu_used_vram{job="amd-device-metrics-gpu"} * 100 / amd_gpu_total_vram{job="amd-device-metrics-gpu"} > 0)) + + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: gtt + expr: avg by (job) ((amd_gpu_used_gtt{job="amd-device-metrics-gpu"} * 100 / amd_gpu_total_gtt{job="amd-device-metrics-gpu"} > 0)) + + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: visiblevram + expr: avg by (job) ((amd_gpu_used_visible_ram{job="amd-device-metrics-gpu"} * 100 / amd_gpu_total_visible_ram{job="amd-device-metrics-gpu"} > 0)) + + # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job + - record: job:ceems_gpu_power_watts:pue + expr: sum by (job)(1 * amd_gpu_package_power{job="amd-device-metrics-gpu"}) + + # Total equivalent emissions rate (g/s) due to the power consumed by GPUs on all ths hosts + # in a Prometheus job accounting PUE value. + # The equivalent emissions are estimated for country FR + - record: job:ceems_gpu_emissions_g_s:pue + expr: |2 + sum by (job, country_code, country, provider) ( + ( + job:ceems_gpu_power_watts:pue{job="amd-device-metrics-gpu"} / 3.6e+06 + * on (job) group_right () + label_replace(ceems_emissions_gCo2_kWh, "job", "amd-device-metrics-gpu", "instance", "(.*)") + ) ) +amd-smi-gpu-gpu.rules +--- +# Recording rules for AMD GPUs scrape job amd-smi-gpu. 
+# +# These recording rules are used when AMD SMI exporter is found +# in Prometheus targets +# https://www.amd.com/en/developer/e-sms/amdsmi-library.html +# +# These rules map the GPU usage to the compute unit `uuid` which gives +# GPU metrics for each compute unit. +# +# We leverage these rules to include PUE (Power Usage Effectiveness) in the Power +# estimation as well. +# +groups: + - name: compute-unit-gpu-rules-amd-smi-gpu + interval: 1s + rules: + # GPU Usage (%) by compute unit + - record: uuid:ceems_gpu_usage:ratio + expr: |2 + label_replace(amd_gpu_use_percent{job="amd-smi-gpu"}, "index", "$1", "gpu_use_percent", "(.*)") + * on (index) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"} + + # GPU Memory Usage (%) by compute unit + - record: uuid:ceems_gpu_memory_usage:ratio + expr: |2 + label_replace(amd_gpu_memory_use_percent{job="amd-smi-gpu"}, "index", "$1", "gpu_memory_use_percent", "(.*)") * on (index) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. 
- - record: dev:gpu_power_usage_watts:pue - expr: 1 * gpu_power_usage{job="amd-device-metrics-gpu"} + # AMD GPU power is in micro Watts and we need to convert it to Watts here + - record: dev:amd_gpu_power_watts:pue + expr: 1 * label_replace(amd_gpu_power{job="amd-smi-gpu"}, "index", "$1", "gpu_power", "(.*)") / 1e6 - record: uuid:ceems_gpu_power_watts:pue expr: |2 @@ -45,8 +235,8 @@ groups: / on (index) group_left () (sum by (index) (ceems_compute_unit_gpu_sm_count{job="cpu-cray-amd-gpu"}) > 0) ) - * on (index) group_right() - dev:gpu_power_usage_watts:pue{job="amd-device-metrics-gpu"} + * on (index) group_left() + dev:amd_gpu_power_watts:pue{job="amd-smi-gpu"} * on (index) group_right() ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"} @@ -56,50 +246,35 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:gpu_power_usage_watts:pue{job="amd-device-metrics-gpu"} / 3.6e+06 - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"}, + uuid:ceems_gpu_power_watts:pue{job="cpu-cray-amd-gpu"} / 3.6e+06, "provider", "owid", "instance", "(.*)" ) * on (provider) group_left () - label_replace( - ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, - "common_label", - "mock", - "instance", - "(.*)" - ) + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"} # The following recording rules estimate the average GPU, GPU memory usages and # total GPU power and its equivalent emissions aggregared for all hosts # per Prometheus job. 
# - - name: host-agg-gpu-rules-amd-device-metrics-gpu + - name: host-agg-gpu-rules-amd-smi-gpu interval: 2s rules: # Average GPU Usage (%) for all hosts aggregated per Prometheus job - record: job:ceems_gpu_usage:avg - expr: avg by (job) (gpu_gfx_activity{job="amd-device-metrics-gpu"}) + expr: avg by (job) (amd_gpu_use_percent{job="amd-smi-gpu"}) # Average GPU memory usage (%) for all hosts aggregated per Prometheus job - record: job:ceems_gpu_memory_usage:avg_ratio - expr: |2 - avg by (job) ( - ( - gpu_used_vram{job="amd-device-metrics-gpu"} * 100 - / - gpu_total_vram{job="amd-device-metrics-gpu"} - ) - ) + expr: avg by (job) (amd_gpu_memory_use_percent{job="amd-smi-gpu"}) # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job # AMD GPU power is in micro Watts and we need to convert it to Watts here - record: job:ceems_gpu_power_watts:pue - expr: sum by (job)(1 * gpu_power_usage{job="amd-device-metrics-gpu"} / 1e6) + expr: sum by (job)(1 * amd_gpu_power{job="amd-smi-gpu"} / 1e6) # Total equivalent emissions rate (g/s) due to the power consumed by GPUs on all ths hosts # in a Prometheus job accounting PUE value. @@ -108,9 +283,9 @@ groups: expr: |2 sum by (job, country_code, country, provider) ( ( - job:ceems_gpu_power_watts:pue{job="amd-device-metrics-gpu"} / 3.6e+06 + job:ceems_gpu_power_watts:pue{job="amd-smi-gpu"} / 3.6e+06 * on (job) group_right () - label_replace(ceems_emissions_gCo2_kWh, "job", "amd-device-metrics-gpu", "instance", "(.*)") + label_replace(ceems_emissions_gCo2_kWh, "job", "amd-smi-gpu", "instance", "(.*)") ) ) cpu-cray-amd-gpu.rules @@ -289,9 +464,9 @@ groups: ) ) -cpu-ipmi-nvidia-gpu.rules +cpu-hwmon-amd-gpu.rules --- -# Recording rules for scrape job cpu-ipmi-nvidia-gpu +# Recording rules for scrape job cpu-hwmon-amd-gpu # # The following recording rules provide several CPU related metrics of the individual # compute units. Each of these metrics involving multiple raw metrics to compute them. 
@@ -302,31 +477,31 @@ cpu-ipmi-nvidia-gpu.rules # which should not increase the disk use of TSDB enormously. # groups: - - name: compute-unit-rules-cpu-ipmi-nvidia-gpu + - name: compute-unit-rules-cpu-hwmon-amd-gpu interval: 1s rules: # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. - record: uuid:ceems_cpu_usage:ratio_irate expr: |2 ( - irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) + - irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) ) * 100 / - (ceems_compute_unit_cpus{job="cpu-ipmi-nvidia-gpu"} > 0) + (ceems_compute_unit_cpus{job="cpu-hwmon-amd-gpu"} > 0) # CPU memory usage (%) of compute unit. It is percentage of CPU memory used by compute unit relative to # the available memory to the compute unit. - record: uuid:ceems_cpu_memory_usage:ratio expr: |2 - ceems_compute_unit_memory_used_bytes{job="cpu-ipmi-nvidia-gpu"} * 100 + ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} * 100 / - (ceems_compute_unit_memory_total_bytes{job="cpu-ipmi-nvidia-gpu"} > 0) + (ceems_compute_unit_memory_total_bytes{job="cpu-hwmon-amd-gpu"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. 
- - record: instance:ceems_ipmi_dcmi_power_current_watts:pue - expr: 1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 + - record: instance:ceems_hwmon_power_current_watts:pue + expr: 1 * (sum without (sensor) (ceems_hwmon_power_current_watts{job="cpu-hwmon-amd-gpu",chip="socket"}) - on (hostname) group_left () sum by (hostname) (sum by (hostname,serial_number) (amd_gpu_package_power{job="amd-device-metrics-gpu"}))) # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -359,27 +534,59 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_hwmon_power_current_watts:pue{job="cpu-hwmon-amd-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
+ * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power + ( + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + ) + ) * on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power ( ( - irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) + - irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) ) / on (instance) group_left () - sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-ipmi-nvidia-gpu",mode!~"idle|iowait|steal"}[2s])) + sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-hwmon-amd-gpu",mode!~"idle|iowait|steal"}[2s])) ) + - 0.1 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Total Misc Power Usage + 0.9 * instance:ceems_hwmon_power_current_watts:pue{job="cpu-hwmon-amd-gpu"} + * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power + ( + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + ) + ) + * on (instance) group_right () # Total CPU Memory Power * (Compute Unit Memory / Total Memory) -> Compute Unit CPU Memory Power + ( + ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} + / on (instance) group_left () + ( + ceems_meminfo_MemTotal_bytes{job="cpu-hwmon-amd-gpu"} + - 
on (instance) + ceems_meminfo_MemAvailable_bytes{job="cpu-hwmon-amd-gpu"} + ) + ) + + + 0.1 * instance:ceems_hwmon_power_current_watts:pue{job="cpu-hwmon-amd-gpu"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( - ceems_compute_unit_memory_used_bytes{job="cpu-ipmi-nvidia-gpu"} + ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} / ( - ceems_compute_unit_memory_used_bytes{job="cpu-ipmi-nvidia-gpu"} + ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} * on (instance) group_left () - ceems_compute_units{job="cpu-ipmi-nvidia-gpu"} + ceems_compute_units{job="cpu-hwmon-amd-gpu"} ) > 0 ) @@ -389,7 +596,7 @@ groups: - record: uuid:ceems_host_emissions_g_s:pue expr: |2 label_replace( - uuid:ceems_host_power_watts:pue{job="cpu-ipmi-nvidia-gpu"} / 3.6e+06, + uuid:ceems_host_power_watts:pue{job="cpu-hwmon-amd-gpu"} / 3.6e+06, "provider", "owid", "instance", @@ -403,7 +610,7 @@ groups: # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts # per Prometheus job. # - - name: host-agg-rules-cpu-ipmi-nvidia-gpu + - name: host-agg-rules-cpu-hwmon-amd-gpu interval: 2s rules: # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. 
@@ -412,12 +619,12 @@ groups: avg by (job) ( ( sum by (job, instance) ( - irate(ceems_cpu_seconds_total{job="cpu-ipmi-nvidia-gpu",mode!~"idle|iowait|steal"}[2s]) + irate(ceems_cpu_seconds_total{job="cpu-hwmon-amd-gpu",mode!~"idle|iowait|steal"}[2s]) ) * 100 / on (instance) group_left () - ((ceems_cpu_count{job="cpu-ipmi-nvidia-gpu"} > 0) / ceems_cpu_per_core_count{job="cpu-ipmi-nvidia-gpu"}) + ((ceems_cpu_count{job="cpu-hwmon-amd-gpu"} > 0) / ceems_cpu_per_core_count{job="cpu-hwmon-amd-gpu"}) ) ) @@ -430,7 +637,7 @@ groups: ( 1 - - (ceems_meminfo_MemAvailable_bytes{job="cpu-ipmi-nvidia-gpu"} / ceems_meminfo_MemTotal_bytes{job="cpu-ipmi-nvidia-gpu"}) + (ceems_meminfo_MemAvailable_bytes{job="cpu-hwmon-amd-gpu"} / ceems_meminfo_MemTotal_bytes{job="cpu-hwmon-amd-gpu"}) ) ) ) @@ -440,7 +647,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) + sum by (job) (1 * (sum without (sensor) (ceems_hwmon_power_current_watts{job="cpu-hwmon-amd-gpu",chip="socket"}) - on (hostname) group_left () sum by (hostname) (sum by (hostname,serial_number) (amd_gpu_package_power{job="amd-device-metrics-gpu"})))) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. 
@@ -449,14 +656,14 @@ groups: expr: |2 sum by (job, country_code, country, provider) ( ( - job:ceems_host_power_watts:pue{job="cpu-ipmi-nvidia-gpu"} / 3.6e+06 + job:ceems_host_power_watts:pue{job="cpu-hwmon-amd-gpu"} / 3.6e+06 * on (job) group_right () - label_replace(ceems_emissions_gCo2_kWh, "job", "cpu-ipmi-nvidia-gpu", "instance", "(.*)") + label_replace(ceems_emissions_gCo2_kWh, "job", "cpu-hwmon-amd-gpu", "instance", "(.*)") ) ) -cpu-only-ipmi.rules +cpu-ipmi-nvidia-gpu.rules --- -# Recording rules for scrape job cpu-only-ipmi +# Recording rules for scrape job cpu-ipmi-nvidia-gpu # # The following recording rules provide several CPU related metrics of the individual # compute units. Each of these metrics involving multiple raw metrics to compute them. @@ -467,31 +674,31 @@ cpu-only-ipmi.rules # which should not increase the disk use of TSDB enormously. # groups: - - name: compute-unit-rules-cpu-only-ipmi + - name: compute-unit-rules-cpu-ipmi-nvidia-gpu interval: 1s rules: # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. - record: uuid:ceems_cpu_usage:ratio_irate expr: |2 ( - irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-only-ipmi"}[2s]) + irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + - irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-only-ipmi"}[2s]) + irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-ipmi-nvidia-gpu"}[2s]) ) * 100 / - (ceems_compute_unit_cpus{job="cpu-only-ipmi"} > 0) + (ceems_compute_unit_cpus{job="cpu-ipmi-nvidia-gpu"} > 0) # CPU memory usage (%) of compute unit. It is percentage of CPU memory used by compute unit relative to # the available memory to the compute unit. 
- record: uuid:ceems_cpu_memory_usage:ratio expr: |2 - ceems_compute_unit_memory_used_bytes{job="cpu-only-ipmi"} * 100 + ceems_compute_unit_memory_used_bytes{job="cpu-ipmi-nvidia-gpu"} * 100 / - (ceems_compute_unit_memory_total_bytes{job="cpu-only-ipmi"} > 0) + (ceems_compute_unit_memory_total_bytes{job="cpu-ipmi-nvidia-gpu"} > 0) # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - record: instance:ceems_ipmi_dcmi_power_current_watts:pue - expr: 1 * ceems_ipmi_dcmi_power_current_watts{job="cpu-only-ipmi"} + expr: 1 * (ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"} - on (hostname) group_left () sum by (hostname) (avg by (hostname,device) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"}, "hostname", "$1", "Hostname","(.*)")))) # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -524,59 +731,27 @@ groups: # - record: uuid:ceems_host_power_watts:pue expr: |2 - 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. - * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power - ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-ipmi"}[2s])) - / - ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-ipmi"}[2s])) - + - sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-ipmi"}[2s])) - ) - ) + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
* on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power ( ( - irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-only-ipmi"}[2s]) + irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-ipmi-nvidia-gpu"}[2s]) + - irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-only-ipmi"}[2s]) + irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-ipmi-nvidia-gpu"}[2s]) ) / on (instance) group_left () - sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-only-ipmi",mode!~"idle|iowait|steal"}[2s])) - ) - + - 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} - * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power - ( - sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-ipmi"}[2s])) - / - ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-ipmi"}[2s])) - + - sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-ipmi"}[2s])) - ) - ) - * on (instance) group_right () # Total CPU Memory Power * (Compute Unit Memory / Total Memory) -> Compute Unit CPU Memory Power - ( - ceems_compute_unit_memory_used_bytes{job="cpu-only-ipmi"} - / on (instance) group_left () - ( - ceems_meminfo_MemTotal_bytes{job="cpu-only-ipmi"} - - on (instance) - ceems_meminfo_MemAvailable_bytes{job="cpu-only-ipmi"} - ) + sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-ipmi-nvidia-gpu",mode!~"idle|iowait|steal"}[2s])) ) + - 0.1 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} # Total Misc Power Usage + 0.1 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-ipmi-nvidia-gpu"} # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( - ceems_compute_unit_memory_used_bytes{job="cpu-only-ipmi"} + ceems_compute_unit_memory_used_bytes{job="cpu-ipmi-nvidia-gpu"} / ( - 
ceems_compute_unit_memory_used_bytes{job="cpu-only-ipmi"} + ceems_compute_unit_memory_used_bytes{job="cpu-ipmi-nvidia-gpu"} * on (instance) group_left () - ceems_compute_units{job="cpu-only-ipmi"} + ceems_compute_units{job="cpu-ipmi-nvidia-gpu"} ) > 0 ) @@ -586,7 +761,7 @@ groups: - record: uuid:ceems_host_emissions_g_s:pue expr: |2 label_replace( - uuid:ceems_host_power_watts:pue{job="cpu-only-ipmi"} / 3.6e+06, + uuid:ceems_host_power_watts:pue{job="cpu-ipmi-nvidia-gpu"} / 3.6e+06, "provider", "owid", "instance", @@ -600,7 +775,7 @@ groups: # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts # per Prometheus job. # - - name: host-agg-rules-cpu-only-ipmi + - name: host-agg-rules-cpu-ipmi-nvidia-gpu interval: 2s rules: # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. @@ -609,12 +784,209 @@ groups: avg by (job) ( ( sum by (job, instance) ( - irate(ceems_cpu_seconds_total{job="cpu-only-ipmi",mode!~"idle|iowait|steal"}[2s]) + irate(ceems_cpu_seconds_total{job="cpu-ipmi-nvidia-gpu",mode!~"idle|iowait|steal"}[2s]) ) * 100 / on (instance) group_left () - ((ceems_cpu_count{job="cpu-only-ipmi"} > 0) / ceems_cpu_per_core_count{job="cpu-only-ipmi"}) + ((ceems_cpu_count{job="cpu-ipmi-nvidia-gpu"} > 0) / ceems_cpu_per_core_count{job="cpu-ipmi-nvidia-gpu"}) + ) + ) + + # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU memory used by host relative to + # the available memory to the host. + - record: job:ceems_cpu_memory_usage:avg_ratio + expr: |2 + avg by (job) ( + ( + ( + 1 + - + (ceems_meminfo_MemAvailable_bytes{job="cpu-ipmi-nvidia-gpu"} / ceems_meminfo_MemTotal_bytes{job="cpu-ipmi-nvidia-gpu"}) + ) + ) + ) + * + 100 + + # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
+ - record: job:ceems_host_power_watts:pue + expr: |2 + sum by (job) (1 * (ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"} - on (hostname) group_left () sum by (hostname) (avg by (hostname,device) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"}, "hostname", "$1", "Hostname","(.*)"))))) + + # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs + # in a Prometheus job accounting PUE value. + # The equivalent emissions are estimated for country FR + - record: job:ceems_host_emissions_g_s:pue + expr: |2 + sum by (job, country_code, country, provider) ( + ( + job:ceems_host_power_watts:pue{job="cpu-ipmi-nvidia-gpu"} / 3.6e+06 + * on (job) group_right () + label_replace(ceems_emissions_gCo2_kWh, "job", "cpu-ipmi-nvidia-gpu", "instance", "(.*)") + ) + ) +cpu-only-ipmi.rules +--- +# Recording rules for scrape job cpu-only-ipmi +# +# The following recording rules provide several CPU related metrics of the individual +# compute units. Each of these metrics involving multiple raw metrics to compute them. +# Performing such queries involving multiple metrics is a computational intensive +# operation for Prometheus and hence, we leverage recording rules to estimate them +# in the real time and store them in the TSDB. The downside of this approach is that +# it creates new metrics which consume more space. However, we add atmost 10 new metrics +# which should not increase the disk use of TSDB enormously. +# +groups: + - name: compute-unit-rules-cpu-only-ipmi + interval: 1s + rules: + # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. + - record: uuid:ceems_cpu_usage:ratio_irate + expr: |2 + ( + irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-only-ipmi"}[2s]) + + + irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-only-ipmi"}[2s]) + ) * 100 + / + (ceems_compute_unit_cpus{job="cpu-only-ipmi"} > 0) + + # CPU memory usage (%) of compute unit. 
It is percentage of CPU memory used by compute unit relative to + # the available memory to the compute unit. + - record: uuid:ceems_cpu_memory_usage:ratio + expr: |2 + ceems_compute_unit_memory_used_bytes{job="cpu-only-ipmi"} * 100 + / + (ceems_compute_unit_memory_total_bytes{job="cpu-only-ipmi"} > 0) + + # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. + - record: instance:ceems_ipmi_dcmi_power_current_watts:pue + expr: 1 * ceems_ipmi_dcmi_power_current_watts{job="cpu-only-ipmi"} + + # Total host power (Watts) consumed by the compute unit accounting PUE value. + # + # Firstly, we make an assumption that 90% of power is consumed by CPU, DRAM and 10% by other + # peripherals like network, storage, etc. + # + # (If the assumption does not fit your infrastructure, you can manually change the values + # in the rules. For instance, if the server has many storage disks, the 10 % can be increased + # further to account for disk power consumption.) + # + # We leverage RAPL package and DRAM counters to split the rest of 90% power between CPU and DRAM + # components, when available. When RAPL counters are not available, we assume all 90% power + # is consumed by CPU. + # + # At node level, power consumed by CPU and DRAM can be estimated as + # + # Total CPU Power = 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) + # Total CPU DRAM Power = 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) + # + # Now we have power usage at node level for CPU and DRAM. We split it further at the + # compute unit level using CPU time and DRAM usage by the compute unit. For rest of + # of the power usage like network, storage, we split it equally among all compute units + # that running on the node at a given time. 
+ # + # Compute Unit CPU Power = Total CPU Power * (Compute CPU Time / Total CPU Time) + # Compute Unit CPU Memory Power = Total CPU DRAM Power * (Compute Unit Memory / Total Memory) + # Misc Power Usage by Compute Unit = 0.1 * Total Power / Number of Compute Units + # + # Total Compute Unit Host Power = Compute Unit CPU Power + Compute Unit CPU Memory Power + Misc Power Usage by Compute Unit + # + - record: uuid:ceems_host_power_watts:pue + expr: |2 + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power + ( + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-ipmi"}[2s])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-ipmi"}[2s])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-ipmi"}[2s])) + ) + ) + * on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power + ( + ( + irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-only-ipmi"}[2s]) + + + irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-only-ipmi"}[2s]) + ) + / on (instance) group_left () + sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-only-ipmi",mode!~"idle|iowait|steal"}[2s])) + ) + + + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} + * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power + ( + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-ipmi"}[2s])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-only-ipmi"}[2s])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-only-ipmi"}[2s])) + ) + ) + * on (instance) group_right () # Total CPU Memory Power * (Compute Unit Memory / Total 
Memory) -> Compute Unit CPU Memory Power + ( + ceems_compute_unit_memory_used_bytes{job="cpu-only-ipmi"} + / on (instance) group_left () + ( + ceems_meminfo_MemTotal_bytes{job="cpu-only-ipmi"} + - on (instance) + ceems_meminfo_MemAvailable_bytes{job="cpu-only-ipmi"} + ) + ) + + + 0.1 * instance:ceems_ipmi_dcmi_power_current_watts:pue{job="cpu-only-ipmi"} # Total Misc Power Usage + * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit + ( + ceems_compute_unit_memory_used_bytes{job="cpu-only-ipmi"} + / + ( + ceems_compute_unit_memory_used_bytes{job="cpu-only-ipmi"} + * on (instance) group_left () + ceems_compute_units{job="cpu-only-ipmi"} + ) > 0 + ) + + # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit. + # The equivalent emissions are estimated using emission factor from owid for country + # FR + - record: uuid:ceems_host_emissions_g_s:pue + expr: |2 + label_replace( + uuid:ceems_host_power_watts:pue{job="cpu-only-ipmi"} / 3.6e+06, + "provider", + "owid", + "instance", + "(.*)" + ) + * on (provider) group_left () + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"} + + + # The following recording rules estimate the average CPU, CPU memory usages and + # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts + # per Prometheus job. + # + - name: host-agg-rules-cpu-only-ipmi + interval: 2s + rules: + # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. 
+ - record: job:ceems_cpu_usage:avg_ratio_irate + expr: |2 + avg by (job) ( + ( + sum by (job, instance) ( + irate(ceems_cpu_seconds_total{job="cpu-only-ipmi",mode!~"idle|iowait|steal"}[2s]) + ) + * + 100 + / on (instance) group_left () + ((ceems_cpu_count{job="cpu-only-ipmi"} > 0) / ceems_cpu_per_core_count{job="cpu-only-ipmi"}) ) ) @@ -850,7 +1222,7 @@ groups: # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - record: instance:ceems_redfish_power_current_watts:pue - expr: 1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"} + expr: 1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -996,7 +1368,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) + sum by (job) (1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"})) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -1047,7 +1419,7 @@ groups: # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - record: instance:ceems_redfish_power_current_watts:pue - expr: 1 * ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_1"} + expr: 1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_1"}) # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -1193,7 +1565,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_1"}) + sum by (job) (1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_1"})) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -1228,7 +1600,7 @@ groups: # Ref: https://github.com/NVIDIA/DCGM/issues/80#issuecomment-1537603016 - record: uuid:ceems_gpu_usage:ratio expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} @@ -1236,16 +1608,16 @@ groups: - record: uuid:ceems_gpu_memory_usage:ratio expr: |2 ( - DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 / - (DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"} + DCGM_FI_DEV_FB_FREE{job="ipmi-nvidia-gpu"}) + (label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + label_replace(label_replace(DCGM_FI_DEV_FB_FREE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)")) ) * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. 
- record: dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue - expr: 1 * DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} + expr: 1 * label_replace(label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") # When profiling metrics are available, we split the total power consumed by physical # GPU among all MIG instances based on "effective" SM usage on each MIG instance. This @@ -1258,10 +1630,10 @@ groups: ( ( ( - DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-ipmi-nvidia-gpu"} ) ) @@ -1269,10 +1641,10 @@ groups: ( sum by (gpuuuid) ( ( - DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-ipmi-nvidia-gpu"} ) > @@ -1280,9 +1652,9 @@ groups: ) ) ) - * on (gpuuuid) group_right () + * on (gpuuuid,gpuiid) group_left () dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="ipmi-nvidia-gpu"} - * on (gpuuuid, gpuiid) group_right () + * on (gpuuuid,gpuiid,uuid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} # Total equivalent emissions rate (g/s) 
from GPU due to the power consumed by the compute unit's GPUs. @@ -1291,41 +1663,87 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="ipmi-nvidia-gpu"} / 3.6e+06 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"}, + uuid:ceems_gpu_power_watts:pue{job="cpu-ipmi-nvidia-gpu"} / 3.6e+06, "provider", "owid", "instance", "(.*)" ) * on (provider) group_left () - label_replace( - ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, - "common_label", - "mock", - "instance", - "(.*)" - ) + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"} # Profiling metrics - - record: uuid:ceems_gpu_sm_active:ratio + - record: uuid:ceems_gpu_prof_sm_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_sm_occupancy:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_gr_engine_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_tensor_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + 
ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp64_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP64_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp32_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP32_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp16_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP16_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_dram_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_DRAM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_tx_bytes:rate + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_TX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_rx_bytes:rate expr: |2 - DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_RX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) 
group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} - - record: uuid:ceems_gpu_sm_occupancy:ratio + - record: uuid:ceems_gpu_prof_pcie_tx_bytes:rate expr: |2 - DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_PCIE_TX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} - - record: uuid:ceems_gpu_gr_engine_active:ratio + - record: uuid:ceems_gpu_prof_pcie_rx_bytes:rate expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_PCIE_RX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} @@ -1389,7 +1807,7 @@ groups: # Ref: https://github.com/NVIDIA/DCGM/issues/80#issuecomment-1537603016 - record: uuid:ceems_gpu_usage:ratio expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} @@ -1397,16 +1815,16 @@ groups: - record: uuid:ceems_gpu_memory_usage:ratio expr: |2 ( - DCGM_FI_DEV_FB_USED{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 / - (DCGM_FI_DEV_FB_USED{job="nvidia-gpu"} + DCGM_FI_DEV_FB_FREE{job="nvidia-gpu"}) + (label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + label_replace(label_replace(DCGM_FI_DEV_FB_FREE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", 
"GPU_I_ID", "(.*)")) ) * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. - record: dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue - expr: 1 * DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} + expr: 1 * label_replace(label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") # When profiling metrics are available, we split the total power consumed by physical # GPU among all MIG instances based on "effective" SM usage on each MIG instance. This @@ -1419,10 +1837,10 @@ groups: ( ( ( - DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-redfish-nvidia-gpu"} ) ) @@ -1430,10 +1848,10 @@ groups: ( sum by (gpuuuid) ( ( - DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-redfish-nvidia-gpu"} ) > @@ -1441,9 +1859,9 @@ groups: ) ) ) - * on (gpuuuid) group_right () + * on (gpuuuid,gpuiid) group_left () dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="nvidia-gpu"} - * on (gpuuuid, 
gpuiid) group_right () + * on (gpuuuid,gpuiid,uuid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} # Total equivalent emissions rate (g/s) from GPU due to the power consumed by the compute unit's GPUs. @@ -1452,60 +1870,106 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="nvidia-gpu"} / 3.6e+06 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"}, + uuid:ceems_gpu_power_watts:pue{job="cpu-redfish-nvidia-gpu"} / 3.6e+06, "provider", "owid", "instance", "(.*)" ) * on (provider) group_left () - label_replace( - ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, - "common_label", - "mock", - "instance", - "(.*)" - ) + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"} # Profiling metrics - - record: uuid:ceems_gpu_sm_active:ratio + - record: uuid:ceems_gpu_prof_sm_active:ratio expr: |2 - DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} - - record: uuid:ceems_gpu_sm_occupancy:ratio + - record: uuid:ceems_gpu_prof_sm_occupancy:ratio expr: |2 - DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} - - record: uuid:ceems_gpu_gr_engine_active:ratio + - record: uuid:ceems_gpu_prof_gr_engine_active:ratio expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") 
* 100 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} - - # The following recording rules estimate the average GPU, GPU memory usages and - # total GPU power and its equivalent emissions aggregared for all hosts - # per Prometheus job. - # - - name: host-agg-gpu-rules-nvidia-gpu - interval: 2s - rules: - # Average GPU Usage (%) for all hosts aggregated per Prometheus job - - record: job:ceems_gpu_usage:avg - expr: avg by (job) (DCGM_FI_DEV_GPU_UTIL{job="nvidia-gpu"}) - - # Average GPU memory usage (%) for all hosts aggregated per Prometheus job - - record: job:ceems_gpu_memory_usage:avg_ratio + - record: uuid:ceems_gpu_prof_pipe_tensor_active:ratio expr: |2 - avg by (job) ( + label_replace(label_replace(DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp64_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP64_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp32_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP32_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp16_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP16_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: 
uuid:ceems_gpu_prof_dram_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_DRAM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_tx_bytes:rate + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_TX_BYTES{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_rx_bytes:rate + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_RX_BYTES{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pcie_tx_bytes:rate + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PCIE_TX_BYTES{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pcie_rx_bytes:rate + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PCIE_RX_BYTES{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + + # The following recording rules estimate the average GPU, GPU memory usages and + # total GPU power and its equivalent emissions aggregared for all hosts + # per Prometheus job. 
+ # + - name: host-agg-gpu-rules-nvidia-gpu + interval: 2s + rules: + # Average GPU Usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_usage:avg + expr: avg by (job) (DCGM_FI_DEV_GPU_UTIL{job="nvidia-gpu"}) + + # Average GPU memory usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_memory_usage:avg_ratio + expr: |2 + avg by (job) ( ( DCGM_FI_DEV_FB_USED{job="nvidia-gpu"} * 100 / @@ -1550,24 +2014,217 @@ groups: # GPU Usage (%) by compute unit - record: uuid:ceems_gpu_usage:ratio expr: |2 - gpu_gfx_activity{job="amd-device-metrics-gpu"} - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"} + label_replace(label_replace(amd_gpu_gfx_activity{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} # GPU Memory Usage (%) by compute unit - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: vram + expr: |2 + ( + label_replace(label_replace(amd_gpu_used_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") * 100 + / + label_replace(label_replace(amd_gpu_total_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: gtt + expr: |2 + ( + label_replace(label_replace(amd_gpu_used_gtt{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") * 100 + / + label_replace(label_replace(amd_gpu_total_gtt{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid) 
group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: visiblevram expr: |2 ( - gpu_used_vram{job="amd-device-metrics-gpu"} * 100 + label_replace(label_replace(amd_gpu_used_visible_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") * 100 / - gpu_total_vram{job="amd-device-metrics-gpu"} + label_replace(label_replace(amd_gpu_total_visible_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. + # For AMD device metrics exporter, gpu_partition 0 is the one that has non zero power usage and + # rest of partitions will report zero. So, we need to always consider partition 0 to get real usage + # of GPU. In the relabel config we change + - record: dev:gpu_power_usage_watts:pue + expr: 1 * label_replace(label_replace(amd_gpu_package_power{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + + - record: uuid:ceems_gpu_power_watts:pue + expr: |2 + ( + ceems_compute_unit_gpu_sm_count{job="cpu-hwmon-amd-gpu"} + / on (gpuuuid) group_left () + (sum by (gpuuuid) (ceems_compute_unit_gpu_sm_count{job="cpu-hwmon-amd-gpu"}) > 0) + ) + * on (gpuuuid) group_left() + dev:gpu_power_usage_watts:pue{job="amd-device-metrics-gpu",gpuiid="0"} + * on (gpuuuid,gpuiid,uuid) group_right() + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit's GPUs. 
+ # The equivalent emissions are estimated using static emission factor from OWID for country + # FR + - record: uuid:ceems_gpu_emissions_g_s:pue + expr: |2 + label_replace( + uuid:ceems_gpu_power_watts:pue{job="amd-device-metrics-gpu"} / 3.6e+06, + "provider", + "owid", + "instance", + "(.*)" + ) + * 44.179085 + + # Profiling metrics + + - record: uuid:ceems_gpu_prof_sm_active:ratio + expr: |2 + label_replace(label_replace(amd_gpu_prof_sm_active{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_tensor_active_percent:ratio + expr: |2 + label_replace(label_replace(amd_gpu_prof_tensor_active_percent{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_occupancy_percent:ratio + expr: |2 + label_replace(label_replace(amd_gpu_prof_occupancy_percent{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_total_16_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_16_ops{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_total_32_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_32_ops{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + 
ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_total_64_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_64_ops{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_write_size:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_write_size{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_fetch_size:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_fetch_size{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + + # The following recording rules estimate the average GPU, GPU memory usages and + # total GPU power and its equivalent emissions aggregared for all hosts + # per Prometheus job. 
+ # + - name: host-agg-gpu-rules-amd-device-metrics-gpu + interval: 2s + rules: + # Average GPU Usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_usage:avg + expr: avg by (job) (amd_gpu_gfx_activity{job="amd-device-metrics-gpu"}) + + # Average GPU memory usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: vram + expr: avg by (job) ((amd_gpu_used_vram{job="amd-device-metrics-gpu"} * 100 / amd_gpu_total_vram{job="amd-device-metrics-gpu"} > 0)) + + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: gtt + expr: avg by (job) ((amd_gpu_used_gtt{job="amd-device-metrics-gpu"} * 100 / amd_gpu_total_gtt{job="amd-device-metrics-gpu"} > 0)) + + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: visiblevram + expr: avg by (job) ((amd_gpu_used_visible_ram{job="amd-device-metrics-gpu"} * 100 / amd_gpu_total_visible_ram{job="amd-device-metrics-gpu"} > 0)) + + # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job + - record: job:ceems_gpu_power_watts:pue + expr: sum by (job)(1 * amd_gpu_package_power{job="amd-device-metrics-gpu"}) + + # Total equivalent emissions rate (g/s) due to the power consumed by GPUs on all ths hosts + # in a Prometheus job accounting PUE value. + # The equivalent emissions are estimated using static emission factor from OWID for country + # FR + - record: job:ceems_gpu_emissions_g_s:pue + expr: |2 + label_replace( + label_replace( + 44.179085 * job:ceems_gpu_power_watts:pue{job="amd-device-metrics-gpu"} / 3.6e+06, + "provider", + "owid", + "instance", + "(.*)" + ), + "country_code", + "FR", + "instance", + "(.*)" ) +amd-smi-gpu-gpu.rules +--- +# Recording rules for AMD GPUs scrape job amd-smi-gpu. 
+# +# These recording rules are used when AMD SMI exporter is found +# in Prometheus targets +# https://www.amd.com/en/developer/e-sms/amdsmi-library.html +# +# These rules map the GPU usage to the compute unit `uuid` which gives +# GPU metrics for each compute unit. +# +# We leverage these rules to include PUE (Power Usage Effectiveness) in the Power +# estimation as well. +# +groups: + - name: compute-unit-gpu-rules-amd-smi-gpu + interval: 1s + rules: + # GPU Usage (%) by compute unit + - record: uuid:ceems_gpu_usage:ratio + expr: |2 + label_replace(amd_gpu_use_percent{job="amd-smi-gpu"}, "index", "$1", "gpu_use_percent", "(.*)") + * on (index) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"} + + # GPU Memory Usage (%) by compute unit + - record: uuid:ceems_gpu_memory_usage:ratio + expr: |2 + label_replace(amd_gpu_memory_use_percent{job="amd-smi-gpu"}, "index", "$1", "gpu_memory_use_percent", "(.*)") * on (index) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. 
- - record: dev:gpu_power_usage_watts:pue - expr: 1 * gpu_power_usage{job="amd-device-metrics-gpu"} + # AMD GPU power is in micro Watts and we need to convert it to Watts here + - record: dev:amd_gpu_power_watts:pue + expr: 1 * label_replace(amd_gpu_power{job="amd-smi-gpu"}, "index", "$1", "gpu_power", "(.*)") / 1e6 - record: uuid:ceems_gpu_power_watts:pue expr: |2 @@ -1576,8 +2233,8 @@ groups: / on (index) group_left () (sum by (index) (ceems_compute_unit_gpu_sm_count{job="cpu-cray-amd-gpu"}) > 0) ) - * on (index) group_right() - dev:gpu_power_usage_watts:pue{job="amd-device-metrics-gpu"} + * on (index) group_left() + dev:amd_gpu_power_watts:pue{job="amd-smi-gpu"} * on (index) group_right() ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"} @@ -1587,9 +2244,7 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:gpu_power_usage_watts:pue{job="amd-device-metrics-gpu"} / 3.6e+06 - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"}, + uuid:ceems_gpu_power_watts:pue{job="amd-smi-gpu"} / 3.6e+06, "provider", "owid", "instance", @@ -1601,28 +2256,21 @@ groups: # total GPU power and its equivalent emissions aggregared for all hosts # per Prometheus job. 
# - - name: host-agg-gpu-rules-amd-device-metrics-gpu + - name: host-agg-gpu-rules-amd-smi-gpu interval: 2s rules: # Average GPU Usage (%) for all hosts aggregated per Prometheus job - record: job:ceems_gpu_usage:avg - expr: avg by (job) (gpu_gfx_activity{job="amd-device-metrics-gpu"}) + expr: avg by (job) (amd_gpu_use_percent{job="amd-smi-gpu"}) # Average GPU memory usage (%) for all hosts aggregated per Prometheus job - record: job:ceems_gpu_memory_usage:avg_ratio - expr: |2 - avg by (job) ( - ( - gpu_used_vram{job="amd-device-metrics-gpu"} * 100 - / - gpu_total_vram{job="amd-device-metrics-gpu"} - ) - ) + expr: avg by (job) (amd_gpu_memory_use_percent{job="amd-smi-gpu"}) # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job # AMD GPU power is in micro Watts and we need to convert it to Watts here - record: job:ceems_gpu_power_watts:pue - expr: sum by (job)(1 * gpu_power_usage{job="amd-device-metrics-gpu"} / 1e6) + expr: sum by (job)(1 * amd_gpu_power{job="amd-smi-gpu"} / 1e6) # Total equivalent emissions rate (g/s) due to the power consumed by GPUs on all ths hosts # in a Prometheus job accounting PUE value. 
@@ -1632,7 +2280,7 @@ groups: expr: |2 label_replace( label_replace( - 44.179085 * job:ceems_gpu_power_watts:pue{job="amd-device-metrics-gpu"} / 3.6e+06, + 44.179085 * job:ceems_gpu_power_watts:pue{job="amd-smi-gpu"} / 3.6e+06, "provider", "owid", "instance", @@ -1720,23 +2368,234 @@ groups: ceems_meminfo_MemAvailable_bytes{job="cpu-cray-amd-gpu"} ) ) - + + + + ( + instance:ceems_cray_pm_counters_power_watts:pue{domain="node",job="cpu-cray-amd-gpu"} # Misc Power Usage by Compute Unit + - on (instance) + sum by (instance) (instance:ceems_cray_pm_counters_power_watts:pue{domain!~"node",job="cpu-cray-amd-gpu"}) + ) + * on (instance) group_right () + ( + ceems_compute_unit_memory_used_bytes{job="cpu-cray-amd-gpu"} + / + ( + ceems_compute_unit_memory_used_bytes{job="cpu-cray-amd-gpu"} + * on (instance) group_left () + ceems_compute_units{job="cpu-cray-amd-gpu"} + ) + > + 0 + ) + + # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit. + # The equivalent emissions are estimated using static emission factor from OWID for country + # FR + - record: uuid:ceems_host_emissions_g_s:pue + expr: |2 + label_replace( + uuid:ceems_host_power_watts:pue{job="cpu-cray-amd-gpu"} / 3.6e+06, + "provider", + "owid", + "instance", + "(.*)" + ) + * 44.179085 + + # The following recording rules estimate the average CPU, CPU memory usages and + # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts + # per Prometheus job. + # + - name: host-agg-rules-cpu-cray-amd-gpu + interval: 2s + rules: + # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. 
+ - record: job:ceems_cpu_usage:avg_ratio_irate + expr: |2 + avg by (job) ( + ( + sum by (job, instance) ( + irate(ceems_cpu_seconds_total{job="cpu-cray-amd-gpu",mode!~"idle|iowait|steal"}[2s]) + ) + * + 100 + / on (instance) group_left () + ((ceems_cpu_count{job="cpu-cray-amd-gpu"} > 0) / ceems_cpu_per_core_count{job="cpu-cray-amd-gpu"}) + ) + ) + + # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU memory used by host relative to + # the available memory to the host. + - record: job:ceems_cpu_memory_usage:avg_ratio + expr: |2 + avg by (job) ( + ( + ( + 1 + - + (ceems_meminfo_MemAvailable_bytes{job="cpu-cray-amd-gpu"} / ceems_meminfo_MemTotal_bytes{job="cpu-cray-amd-gpu"}) + ) + ) + ) + * + 100 + + # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. + - record: job:ceems_host_power_watts:pue + expr: |2 + sum by (job) ( + 1 + * + ( + ceems_cray_pm_counters_power_watts{domain="node",job="cpu-cray-amd-gpu"} + - on (instance) group_left () + sum by (instance) (ceems_cray_pm_counters_power_watts{domain=~"accel.*",job="cpu-cray-amd-gpu"}) + ) + ) + + # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs + # in a Prometheus job accounting PUE value. + # The equivalent emissions are estimated using static emission factor from OWID for country + # FR + - record: job:ceems_host_emissions_g_s:pue + expr: |2 + label_replace( + label_replace( + 44.179085 * job:ceems_host_power_watts:pue{job="cpu-cray-amd-gpu"} / 3.6e+06, + "provider", + "owid", + "instance", + "(.*)" + ), + "country_code", + "FR", + "instance", + "(.*)" + ) + +cpu-hwmon-amd-gpu.rules +--- +# Recording rules for scrape job cpu-hwmon-amd-gpu +# +# The following recording rules provide several CPU related metrics of the individual +# compute units. Each of these metrics involving multiple raw metrics to compute them. 
+# Performing such queries involving multiple metrics is a computational intensive +# operation for Prometheus and hence, we leverage recording rules to estimate them +# in the real time and store them in the TSDB. The downside of this approach is that +# it creates new metrics which consume more space. However, we add atmost 10 new metrics +# which should not increase the disk use of TSDB enormously. +# +groups: + - name: compute-unit-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. + - record: uuid:ceems_cpu_usage:ratio_irate + expr: |2 + ( + irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) + + + irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) + ) * 100 + / + (ceems_compute_unit_cpus{job="cpu-hwmon-amd-gpu"} > 0) + + # CPU memory usage (%) of compute unit. It is percentage of CPU memory used by compute unit relative to + # the available memory to the compute unit. + - record: uuid:ceems_cpu_memory_usage:ratio + expr: |2 + ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} * 100 + / + (ceems_compute_unit_memory_total_bytes{job="cpu-hwmon-amd-gpu"} > 0) + + # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. + - record: instance:ceems_hwmon_power_current_watts:pue + expr: 1 * (sum without (sensor) (ceems_hwmon_power_current_watts{job="cpu-hwmon-amd-gpu",chip="socket"}) - on (hostname) group_left () sum by (hostname) (sum by (hostname,serial_number) (amd_gpu_package_power{job="amd-device-metrics-gpu"}))) + + # Total host power (Watts) consumed by the compute unit accounting PUE value. + # + # Firstly, we make an assumption that 90% of power is consumed by CPU, DRAM and 10% by other + # peripherals like network, storage, etc. + # + # (If the assumption does not fit your infrastructure, you can manually change the values + # in the rules. 
For instance, if the server has many storage disks, the 10 % can be increased + # further to account for disk power consumption.) + # + # We leverage RAPL package and DRAM counters to split the rest of 90% power between CPU and DRAM + # components, when available. When RAPL counters are not available, we assume all 90% power + # is consumed by CPU. + # + # At node level, power consumed by CPU and DRAM can be estimated as + # + # Total CPU Power = 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) + # Total CPU DRAM Power = 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) + # + # Now we have power usage at node level for CPU and DRAM. We split it further at the + # compute unit level using CPU time and DRAM usage by the compute unit. For rest of + # of the power usage like network, storage, we split it equally among all compute units + # that running on the node at a given time. + # + # Compute Unit CPU Power = Total CPU Power * (Compute CPU Time / Total CPU Time) + # Compute Unit CPU Memory Power = Total CPU DRAM Power * (Compute Unit Memory / Total Memory) + # Misc Power Usage by Compute Unit = 0.1 * Total Power / Number of Compute Units + # + # Total Compute Unit Host Power = Compute Unit CPU Power + Compute Unit CPU Memory Power + Misc Power Usage by Compute Unit + # + - record: uuid:ceems_host_power_watts:pue + expr: |2 + 0.9 * instance:ceems_hwmon_power_current_watts:pue{job="cpu-hwmon-amd-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
+ * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power + ( + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + ) + ) + * on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power ( - instance:ceems_cray_pm_counters_power_watts:pue{domain="node",job="cpu-cray-amd-gpu"} # Misc Power Usage by Compute Unit - - on (instance) - sum by (instance) (instance:ceems_cray_pm_counters_power_watts:pue{domain!~"node",job="cpu-cray-amd-gpu"}) + ( + irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) + + + irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) + ) + / on (instance) group_left () + sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-hwmon-amd-gpu",mode!~"idle|iowait|steal"}[2s])) ) - * on (instance) group_right () - ( - ceems_compute_unit_memory_used_bytes{job="cpu-cray-amd-gpu"} + + + 0.9 * instance:ceems_hwmon_power_current_watts:pue{job="cpu-hwmon-amd-gpu"} + * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power + ( + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) / ( - ceems_compute_unit_memory_used_bytes{job="cpu-cray-amd-gpu"} - * on (instance) group_left () - ceems_compute_units{job="cpu-cray-amd-gpu"} + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) ) - > - 0 + ) + * on (instance) group_right () # Total CPU Memory Power * (Compute Unit Memory / Total Memory) -> Compute Unit CPU Memory Power + ( + 
ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} + / on (instance) group_left () + ( + ceems_meminfo_MemTotal_bytes{job="cpu-hwmon-amd-gpu"} + - on (instance) + ceems_meminfo_MemAvailable_bytes{job="cpu-hwmon-amd-gpu"} + ) + ) + + + 0.1 * instance:ceems_hwmon_power_current_watts:pue{job="cpu-hwmon-amd-gpu"} # Total Misc Power Usage + * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit + ( + ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} + / + ( + ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} + * on (instance) group_left () + ceems_compute_units{job="cpu-hwmon-amd-gpu"} + ) > 0 ) # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit. @@ -1745,7 +2604,7 @@ groups: - record: uuid:ceems_host_emissions_g_s:pue expr: |2 label_replace( - uuid:ceems_host_power_watts:pue{job="cpu-cray-amd-gpu"} / 3.6e+06, + uuid:ceems_host_power_watts:pue{job="cpu-hwmon-amd-gpu"} / 3.6e+06, "provider", "owid", "instance", @@ -1757,7 +2616,7 @@ groups: # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts # per Prometheus job. # - - name: host-agg-rules-cpu-cray-amd-gpu + - name: host-agg-rules-cpu-hwmon-amd-gpu interval: 2s rules: # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. 
@@ -1766,12 +2625,12 @@ groups: avg by (job) ( ( sum by (job, instance) ( - irate(ceems_cpu_seconds_total{job="cpu-cray-amd-gpu",mode!~"idle|iowait|steal"}[2s]) + irate(ceems_cpu_seconds_total{job="cpu-hwmon-amd-gpu",mode!~"idle|iowait|steal"}[2s]) ) * 100 / on (instance) group_left () - ((ceems_cpu_count{job="cpu-cray-amd-gpu"} > 0) / ceems_cpu_per_core_count{job="cpu-cray-amd-gpu"}) + ((ceems_cpu_count{job="cpu-hwmon-amd-gpu"} > 0) / ceems_cpu_per_core_count{job="cpu-hwmon-amd-gpu"}) ) ) @@ -1784,7 +2643,7 @@ groups: ( 1 - - (ceems_meminfo_MemAvailable_bytes{job="cpu-cray-amd-gpu"} / ceems_meminfo_MemTotal_bytes{job="cpu-cray-amd-gpu"}) + (ceems_meminfo_MemAvailable_bytes{job="cpu-hwmon-amd-gpu"} / ceems_meminfo_MemTotal_bytes{job="cpu-hwmon-amd-gpu"}) ) ) ) @@ -1794,15 +2653,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) ( - 1 - * - ( - ceems_cray_pm_counters_power_watts{domain="node",job="cpu-cray-amd-gpu"} - - on (instance) group_left () - sum by (instance) (ceems_cray_pm_counters_power_watts{domain=~"accel.*",job="cpu-cray-amd-gpu"}) - ) - ) + sum by (job) (1 * (sum without (sensor) (ceems_hwmon_power_current_watts{job="cpu-hwmon-amd-gpu",chip="socket"}) - on (hostname) group_left () sum by (hostname) (sum by (hostname,serial_number) (amd_gpu_package_power{job="amd-device-metrics-gpu"})))) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. 
@@ -1812,7 +2663,7 @@ groups: expr: |2 label_replace( label_replace( - 44.179085 * job:ceems_host_power_watts:pue{job="cpu-cray-amd-gpu"} / 3.6e+06, + 44.179085 * job:ceems_host_power_watts:pue{job="cpu-hwmon-amd-gpu"} / 3.6e+06, "provider", "owid", "instance", @@ -1823,7 +2674,6 @@ groups: "instance", "(.*)" ) - cpu-ipmi-nvidia-gpu.rules --- # Recording rules for scrape job cpu-ipmi-nvidia-gpu @@ -1861,7 +2711,7 @@ groups: # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - record: instance:ceems_ipmi_dcmi_power_current_watts:pue - expr: 1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 + expr: 1 * (ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"} - on (hostname) group_left () sum by (hostname) (avg by (hostname,device) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"}, "hostname", "$1", "Hostname","(.*)")))) # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -1973,7 +2823,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) + sum by (job) (1 * (ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"} - on (hostname) group_left () sum by (hostname) (avg by (hostname,device) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"}, "hostname", "$1", "Hostname","(.*)"))))) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -2400,7 +3250,7 @@ groups: # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - record: instance:ceems_redfish_power_current_watts:pue - expr: 1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"} + expr: 1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -2544,7 +3394,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) + sum by (job) (1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"})) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -2602,7 +3452,7 @@ groups: # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. 
- record: instance:ceems_redfish_power_current_watts:pue - expr: 1 * (label_replace(ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 + expr: 1 * (sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}) - on (hostname) group_left () sum by (hostname) (avg by (hostname,device) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"}, "hostname", "$1", "Hostname","(.*)")))) # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -2746,7 +3596,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * (label_replace(ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) + sum by (job) (1 * (sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}) - on (hostname) group_left () sum by (hostname) (avg by (hostname,device) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"}, "hostname", "$1", "Hostname","(.*)"))))) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. 
@@ -2788,7 +3638,7 @@ groups: # Ref: https://github.com/NVIDIA/DCGM/issues/80#issuecomment-1537603016 - record: uuid:ceems_gpu_usage:ratio expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} @@ -2796,16 +3646,16 @@ groups: - record: uuid:ceems_gpu_memory_usage:ratio expr: |2 ( - DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 / - (DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"} + DCGM_FI_DEV_FB_FREE{job="ipmi-nvidia-gpu"}) + (label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + label_replace(label_replace(DCGM_FI_DEV_FB_FREE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)")) ) * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. - record: dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue - expr: 1 * DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} + expr: 1 * label_replace(label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") # When profiling metrics are available, we split the total power consumed by physical # GPU among all MIG instances based on "effective" SM usage on each MIG instance. 
This @@ -2818,10 +3668,10 @@ groups: ( ( ( - DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-ipmi-nvidia-gpu"} ) ) @@ -2829,10 +3679,10 @@ groups: ( sum by (gpuuuid) ( ( - DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-ipmi-nvidia-gpu"} ) > @@ -2840,9 +3690,9 @@ groups: ) ) ) - * on (gpuuuid) group_right () + * on (gpuuuid,gpuiid) group_left () dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="ipmi-nvidia-gpu"} - * on (gpuuuid, gpuiid) group_right () + * on (gpuuuid,gpuiid,uuid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit's GPUs. 
@@ -2851,9 +3701,7 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="ipmi-nvidia-gpu"} / 3.6e+06 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"}, + uuid:ceems_gpu_power_watts:pue{job="ipmi-nvidia-gpu"} / 3.6e+06, "provider", "owid", "instance", @@ -2863,21 +3711,75 @@ groups: # Profiling metrics - - record: uuid:ceems_gpu_sm_active:ratio + - record: uuid:ceems_gpu_prof_sm_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_sm_occupancy:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_gr_engine_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_tensor_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp64_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP64_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + 
* on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp32_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP32_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp16_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP16_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_dram_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_DRAM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_tx_bytes:rate + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_TX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_rx_bytes:rate expr: |2 - DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_RX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} - - record: uuid:ceems_gpu_sm_occupancy:ratio + - record: uuid:ceems_gpu_prof_pcie_tx_bytes:rate expr: |2 - DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"} * 100 + 
label_replace(label_replace(DCGM_FI_PROF_PCIE_TX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} - - record: uuid:ceems_gpu_gr_engine_active:ratio + - record: uuid:ceems_gpu_prof_pcie_rx_bytes:rate expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_PCIE_RX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} @@ -2948,7 +3850,7 @@ groups: # Ref: https://github.com/NVIDIA/DCGM/issues/80#issuecomment-1537603016 - record: uuid:ceems_gpu_usage:ratio expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} @@ -2956,16 +3858,16 @@ groups: - record: uuid:ceems_gpu_memory_usage:ratio expr: |2 ( - DCGM_FI_DEV_FB_USED{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 / - (DCGM_FI_DEV_FB_USED{job="nvidia-gpu"} + DCGM_FI_DEV_FB_FREE{job="nvidia-gpu"}) + (label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + label_replace(label_replace(DCGM_FI_DEV_FB_FREE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)")) ) * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. 
- record: dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue - expr: 1 * DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} + expr: 1 * label_replace(label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") # When profiling metrics are available, we split the total power consumed by physical # GPU among all MIG instances based on "effective" SM usage on each MIG instance. This @@ -2978,10 +3880,10 @@ groups: ( ( ( - DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-redfish-nvidia-gpu"} ) ) @@ -2989,10 +3891,10 @@ groups: ( sum by (gpuuuid) ( ( - DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-redfish-nvidia-gpu"} ) > @@ -3000,9 +3902,9 @@ groups: ) ) ) - * on (gpuuuid) group_right () + * on (gpuuuid,gpuiid) group_left () dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="nvidia-gpu"} - * on (gpuuuid, gpuiid) group_right () + * on (gpuuuid,gpuiid,uuid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit's 
GPUs. @@ -3011,9 +3913,7 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="nvidia-gpu"} / 3.6e+06 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"}, + uuid:ceems_gpu_power_watts:pue{job="nvidia-gpu"} / 3.6e+06, "provider", "owid", "instance", @@ -3023,21 +3923,75 @@ groups: # Profiling metrics - - record: uuid:ceems_gpu_sm_active:ratio + - record: uuid:ceems_gpu_prof_sm_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_sm_occupancy:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_gr_engine_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_tensor_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp64_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP64_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on 
(gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp32_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP32_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp16_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP16_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_dram_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_DRAM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_tx_bytes:rate + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_TX_BYTES{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_rx_bytes:rate expr: |2 - DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_RX_BYTES{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} - - record: uuid:ceems_gpu_sm_occupancy:ratio + - record: uuid:ceems_gpu_prof_pcie_tx_bytes:rate expr: |2 - DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_PCIE_TX_BYTES{job="nvidia-gpu"}, 
"gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} - - record: uuid:ceems_gpu_gr_engine_active:ratio + - record: uuid:ceems_gpu_prof_pcie_rx_bytes:rate expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_PCIE_RX_BYTES{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} @@ -3102,30 +4056,223 @@ amd-device-metrics-gpu-gpu.rules # estimation as well. # groups: - - name: compute-unit-gpu-rules-amd-device-metrics-gpu + - name: compute-unit-gpu-rules-amd-device-metrics-gpu + interval: 1s + rules: + # GPU Usage (%) by compute unit + - record: uuid:ceems_gpu_usage:ratio + expr: |2 + label_replace(label_replace(amd_gpu_gfx_activity{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + # GPU Memory Usage (%) by compute unit + - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: vram + expr: |2 + ( + label_replace(label_replace(amd_gpu_used_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") * 100 + / + label_replace(label_replace(amd_gpu_total_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: gtt + expr: |2 + ( + label_replace(label_replace(amd_gpu_used_gtt{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") 
* 100 + / + label_replace(label_replace(amd_gpu_total_gtt{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: visiblevram + expr: |2 + ( + label_replace(label_replace(amd_gpu_used_visible_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") * 100 + / + label_replace(label_replace(amd_gpu_total_visible_vram{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. + # For AMD device metrics exporter, gpu_partition 0 is the one that has non zero power usage and + # rest of partitions will report zero. So, we need to always consider partition 0 to get real usage + # of GPU. In the relabel config we change + - record: dev:gpu_power_usage_watts:pue + expr: 1 * label_replace(label_replace(amd_gpu_package_power{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + + - record: uuid:ceems_gpu_power_watts:pue + expr: |2 + ( + ceems_compute_unit_gpu_sm_count{job="cpu-hwmon-amd-gpu"} + / on (gpuuuid) group_left () + (sum by (gpuuuid) (ceems_compute_unit_gpu_sm_count{job="cpu-hwmon-amd-gpu"}) > 0) + ) + * on (gpuuuid) group_left() + dev:gpu_power_usage_watts:pue{job="amd-device-metrics-gpu",gpuiid="0"} + * on (gpuuuid,gpuiid,uuid) group_right() + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit's GPUs. 
+ # The equivalent emissions are estimated using static emission factor from OWID for country + # + - record: uuid:ceems_gpu_emissions_g_s:pue + expr: |2 + label_replace( + uuid:ceems_gpu_power_watts:pue{job="amd-device-metrics-gpu"} / 3.6e+06, + "provider", + "custom", + "instance", + "(.*)" + ) + * 50 + + # Profiling metrics + + - record: uuid:ceems_gpu_prof_sm_active:ratio + expr: |2 + label_replace(label_replace(amd_gpu_prof_sm_active{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_tensor_active_percent:ratio + expr: |2 + label_replace(label_replace(amd_gpu_prof_tensor_active_percent{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_occupancy_percent:ratio + expr: |2 + label_replace(label_replace(amd_gpu_prof_occupancy_percent{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_total_16_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_16_ops{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_total_32_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_32_ops{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + 
ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_total_64_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_64_ops{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_write_size:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_write_size{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + - record: uuid:ceems_gpu_prof_fetch_size:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_fetch_size{job="amd-device-metrics-gpu"}, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-hwmon-amd-gpu"} + + + # The following recording rules estimate the average GPU, GPU memory usages and + # total GPU power and its equivalent emissions aggregared for all hosts + # per Prometheus job. 
+ # + - name: host-agg-gpu-rules-amd-device-metrics-gpu + interval: 2s + rules: + # Average GPU Usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_usage:avg + expr: avg by (job) (amd_gpu_gfx_activity{job="amd-device-metrics-gpu"}) + + # Average GPU memory usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: vram + expr: avg by (job) ((amd_gpu_used_vram{job="amd-device-metrics-gpu"} * 100 / amd_gpu_total_vram{job="amd-device-metrics-gpu"} > 0)) + + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: gtt + expr: avg by (job) ((amd_gpu_used_gtt{job="amd-device-metrics-gpu"} * 100 / amd_gpu_total_gtt{job="amd-device-metrics-gpu"} > 0)) + + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: visiblevram + expr: avg by (job) ((amd_gpu_used_visible_ram{job="amd-device-metrics-gpu"} * 100 / amd_gpu_total_visible_ram{job="amd-device-metrics-gpu"} > 0)) + + # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job + - record: job:ceems_gpu_power_watts:pue + expr: sum by (job)(1 * amd_gpu_package_power{job="amd-device-metrics-gpu"}) + + # Total equivalent emissions rate (g/s) due to the power consumed by GPUs on all ths hosts + # in a Prometheus job accounting PUE value. + # The equivalent emissions are estimated using static emission factor from OWID for country + # + - record: job:ceems_gpu_emissions_g_s:pue + expr: |2 + label_replace( + label_replace( + 50 * job:ceems_gpu_power_watts:pue{job="amd-device-metrics-gpu"} / 3.6e+06, + "provider", + "custom", + "instance", + "(.*)" + ), + "country_code", + "", + "instance", + "(.*)" + ) +amd-smi-gpu-gpu.rules +--- +# Recording rules for AMD GPUs scrape job amd-smi-gpu. 
+# +# These recording rules are used when AMD SMI exporter is found +# in Prometheus targets +# https://www.amd.com/en/developer/e-sms/amdsmi-library.html +# +# These rules map the GPU usage to the compute unit `uuid` which gives +# GPU metrics for each compute unit. +# +# We leverage these rules to include PUE (Power Usage Effectiveness) in the Power +# estimation as well. +# +groups: + - name: compute-unit-gpu-rules-amd-smi-gpu interval: 1s rules: # GPU Usage (%) by compute unit - record: uuid:ceems_gpu_usage:ratio expr: |2 - gpu_gfx_activity{job="amd-device-metrics-gpu"} + label_replace(amd_gpu_use_percent{job="amd-smi-gpu"}, "index", "$1", "gpu_use_percent", "(.*)") * on (index) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"} # GPU Memory Usage (%) by compute unit - record: uuid:ceems_gpu_memory_usage:ratio expr: |2 - ( - gpu_used_vram{job="amd-device-metrics-gpu"} * 100 - / - gpu_total_vram{job="amd-device-metrics-gpu"} - ) + label_replace(amd_gpu_memory_use_percent{job="amd-smi-gpu"}, "index", "$1", "gpu_memory_use_percent", "(.*)") * on (index) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. 
- - record: dev:gpu_power_usage_watts:pue - expr: 1 * gpu_power_usage{job="amd-device-metrics-gpu"} + # AMD GPU power is in micro Watts and we need to convert it to Watts here + - record: dev:amd_gpu_power_watts:pue + expr: 1 * label_replace(amd_gpu_power{job="amd-smi-gpu"}, "index", "$1", "gpu_power", "(.*)") / 1e6 - record: uuid:ceems_gpu_power_watts:pue expr: |2 @@ -3134,8 +4281,8 @@ groups: / on (index) group_left () (sum by (index) (ceems_compute_unit_gpu_sm_count{job="cpu-cray-amd-gpu"}) > 0) ) - * on (index) group_right() - dev:gpu_power_usage_watts:pue{job="amd-device-metrics-gpu"} + * on (index) group_left() + dev:amd_gpu_power_watts:pue{job="amd-smi-gpu"} * on (index) group_right() ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"} @@ -3145,9 +4292,7 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:gpu_power_usage_watts:pue{job="amd-device-metrics-gpu"} / 3.6e+06 - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job="cpu-cray-amd-gpu"}, + uuid:ceems_gpu_power_watts:pue{job="amd-smi-gpu"} / 3.6e+06, "provider", "custom", "instance", @@ -3159,28 +4304,21 @@ groups: # total GPU power and its equivalent emissions aggregared for all hosts # per Prometheus job. 
# - - name: host-agg-gpu-rules-amd-device-metrics-gpu + - name: host-agg-gpu-rules-amd-smi-gpu interval: 2s rules: # Average GPU Usage (%) for all hosts aggregated per Prometheus job - record: job:ceems_gpu_usage:avg - expr: avg by (job) (gpu_gfx_activity{job="amd-device-metrics-gpu"}) + expr: avg by (job) (amd_gpu_use_percent{job="amd-smi-gpu"}) # Average GPU memory usage (%) for all hosts aggregated per Prometheus job - record: job:ceems_gpu_memory_usage:avg_ratio - expr: |2 - avg by (job) ( - ( - gpu_used_vram{job="amd-device-metrics-gpu"} * 100 - / - gpu_total_vram{job="amd-device-metrics-gpu"} - ) - ) + expr: avg by (job) (amd_gpu_memory_use_percent{job="amd-smi-gpu"}) # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job # AMD GPU power is in micro Watts and we need to convert it to Watts here - record: job:ceems_gpu_power_watts:pue - expr: sum by (job)(1 * gpu_power_usage{job="amd-device-metrics-gpu"} / 1e6) + expr: sum by (job)(1 * amd_gpu_power{job="amd-smi-gpu"} / 1e6) # Total equivalent emissions rate (g/s) due to the power consumed by GPUs on all ths hosts # in a Prometheus job accounting PUE value. @@ -3190,7 +4328,7 @@ groups: expr: |2 label_replace( label_replace( - 50 * job:ceems_gpu_power_watts:pue{job="amd-device-metrics-gpu"} / 3.6e+06, + 50 * job:ceems_gpu_power_watts:pue{job="amd-smi-gpu"} / 3.6e+06, "provider", "custom", "instance", @@ -3382,6 +4520,208 @@ groups: "(.*)" ) +cpu-hwmon-amd-gpu.rules +--- +# Recording rules for scrape job cpu-hwmon-amd-gpu +# +# The following recording rules provide several CPU related metrics of the individual +# compute units. Each of these metrics involving multiple raw metrics to compute them. +# Performing such queries involving multiple metrics is a computational intensive +# operation for Prometheus and hence, we leverage recording rules to estimate them +# in the real time and store them in the TSDB. 
The downside of this approach is that +# it creates new metrics which consume more space. However, we add atmost 10 new metrics +# which should not increase the disk use of TSDB enormously. +# +groups: + - name: compute-unit-rules-cpu-hwmon-amd-gpu + interval: 1s + rules: + # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. + - record: uuid:ceems_cpu_usage:ratio_irate + expr: |2 + ( + irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) + + + irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) + ) * 100 + / + (ceems_compute_unit_cpus{job="cpu-hwmon-amd-gpu"} > 0) + + # CPU memory usage (%) of compute unit. It is percentage of CPU memory used by compute unit relative to + # the available memory to the compute unit. + - record: uuid:ceems_cpu_memory_usage:ratio + expr: |2 + ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} * 100 + / + (ceems_compute_unit_memory_total_bytes{job="cpu-hwmon-amd-gpu"} > 0) + + # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. + - record: instance:ceems_hwmon_power_current_watts:pue + expr: 1 * (sum without (sensor) (ceems_hwmon_power_current_watts{job="cpu-hwmon-amd-gpu",chip="socket"}) - on (hostname) group_left () sum by (hostname) (sum by (hostname,serial_number) (amd_gpu_package_power{job="amd-device-metrics-gpu"}))) + + # Total host power (Watts) consumed by the compute unit accounting PUE value. + # + # Firstly, we make an assumption that 90% of power is consumed by CPU, DRAM and 10% by other + # peripherals like network, storage, etc. + # + # (If the assumption does not fit your infrastructure, you can manually change the values + # in the rules. For instance, if the server has many storage disks, the 10 % can be increased + # further to account for disk power consumption.) 
+ # + # We leverage RAPL package and DRAM counters to split the rest of 90% power between CPU and DRAM + # components, when available. When RAPL counters are not available, we assume all 90% power + # is consumed by CPU. + # + # At node level, power consumed by CPU and DRAM can be estimated as + # + # Total CPU Power = 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) + # Total CPU DRAM Power = 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) + # + # Now we have power usage at node level for CPU and DRAM. We split it further at the + # compute unit level using CPU time and DRAM usage by the compute unit. For rest of + # of the power usage like network, storage, we split it equally among all compute units + # that running on the node at a given time. + # + # Compute Unit CPU Power = Total CPU Power * (Compute CPU Time / Total CPU Time) + # Compute Unit CPU Memory Power = Total CPU DRAM Power * (Compute Unit Memory / Total Memory) + # Misc Power Usage by Compute Unit = 0.1 * Total Power / Number of Compute Units + # + # Total Compute Unit Host Power = Compute Unit CPU Power + Compute Unit CPU Memory Power + Misc Power Usage by Compute Unit + # + - record: uuid:ceems_host_power_watts:pue + expr: |2 + 0.9 * instance:ceems_hwmon_power_current_watts:pue{job="cpu-hwmon-amd-gpu"} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
+ * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power + ( + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + ) + ) + * on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power + ( + ( + irate(ceems_compute_unit_cpu_user_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) + + + irate(ceems_compute_unit_cpu_system_seconds_total{job="cpu-hwmon-amd-gpu"}[2s]) + ) + / on (instance) group_left () + sum by (instance) (irate(ceems_cpu_seconds_total{job="cpu-hwmon-amd-gpu",mode!~"idle|iowait|steal"}[2s])) + ) + + + 0.9 * instance:ceems_hwmon_power_current_watts:pue{job="cpu-hwmon-amd-gpu"} + * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power + ( + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total{job="cpu-hwmon-amd-gpu"}[2s])) + ) + ) + * on (instance) group_right () # Total CPU Memory Power * (Compute Unit Memory / Total Memory) -> Compute Unit CPU Memory Power + ( + ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} + / on (instance) group_left () + ( + ceems_meminfo_MemTotal_bytes{job="cpu-hwmon-amd-gpu"} + - on (instance) + ceems_meminfo_MemAvailable_bytes{job="cpu-hwmon-amd-gpu"} + ) + ) + + + 0.1 * instance:ceems_hwmon_power_current_watts:pue{job="cpu-hwmon-amd-gpu"} # Total Misc Power Usage + * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit + ( + 
ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} + / + ( + ceems_compute_unit_memory_used_bytes{job="cpu-hwmon-amd-gpu"} + * on (instance) group_left () + ceems_compute_units{job="cpu-hwmon-amd-gpu"} + ) > 0 + ) + + # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit. + # The equivalent emissions are estimated using static emission factor from OWID for country + # + - record: uuid:ceems_host_emissions_g_s:pue + expr: |2 + label_replace( + uuid:ceems_host_power_watts:pue{job="cpu-hwmon-amd-gpu"} / 3.6e+06, + "provider", + "custom", + "instance", + "(.*)" + ) + * 50 + + # The following recording rules estimate the average CPU, CPU memory usages and + # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts + # per Prometheus job. + # + - name: host-agg-rules-cpu-hwmon-amd-gpu + interval: 2s + rules: + # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. + - record: job:ceems_cpu_usage:avg_ratio_irate + expr: |2 + avg by (job) ( + ( + sum by (job, instance) ( + irate(ceems_cpu_seconds_total{job="cpu-hwmon-amd-gpu",mode!~"idle|iowait|steal"}[2s]) + ) + * + 100 + / on (instance) group_left () + ((ceems_cpu_count{job="cpu-hwmon-amd-gpu"} > 0) / ceems_cpu_per_core_count{job="cpu-hwmon-amd-gpu"}) + ) + ) + + # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU memory used by host relative to + # the available memory to the host. + - record: job:ceems_cpu_memory_usage:avg_ratio + expr: |2 + avg by (job) ( + ( + ( + 1 + - + (ceems_meminfo_MemAvailable_bytes{job="cpu-hwmon-amd-gpu"} / ceems_meminfo_MemTotal_bytes{job="cpu-hwmon-amd-gpu"}) + ) + ) + ) + * + 100 + + # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
+ - record: job:ceems_host_power_watts:pue + expr: |2 + sum by (job) (1 * (sum without (sensor) (ceems_hwmon_power_current_watts{job="cpu-hwmon-amd-gpu",chip="socket"}) - on (hostname) group_left () sum by (hostname) (sum by (hostname,serial_number) (amd_gpu_package_power{job="amd-device-metrics-gpu"})))) + + # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs + # in a Prometheus job accounting PUE value. + # The equivalent emissions are estimated using static emission factor from OWID for country + # + - record: job:ceems_host_emissions_g_s:pue + expr: |2 + label_replace( + label_replace( + 50 * job:ceems_host_power_watts:pue{job="cpu-hwmon-amd-gpu"} / 3.6e+06, + "provider", + "custom", + "instance", + "(.*)" + ), + "country_code", + "", + "instance", + "(.*)" + ) cpu-ipmi-nvidia-gpu.rules --- # Recording rules for scrape job cpu-ipmi-nvidia-gpu @@ -3419,7 +4759,7 @@ groups: # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - record: instance:ceems_ipmi_dcmi_power_current_watts:pue - expr: 1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 + expr: 1 * (ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"} - on (hostname) group_left () sum by (hostname) (avg by (hostname,device) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"}, "hostname", "$1", "Hostname","(.*)")))) # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -3531,7 +4871,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * (label_replace(ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) + sum by (job) (1 * (ceems_ipmi_dcmi_power_current_watts{job="cpu-ipmi-nvidia-gpu"} - on (hostname) group_left () sum by (hostname) (avg by (hostname,device) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"}, "hostname", "$1", "Hostname","(.*)"))))) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -3958,7 +5298,7 @@ groups: # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - record: instance:ceems_redfish_power_current_watts:pue - expr: 1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"} + expr: 1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -4102,7 +5442,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"}) + sum by (job) (1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-only-redfish",chassis="Chassis_1"})) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. @@ -4160,7 +5500,7 @@ groups: # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. 
- record: instance:ceems_redfish_power_current_watts:pue - expr: 1 * (label_replace(ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 + expr: 1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_1"}) # Total host power (Watts) consumed by the compute unit accounting PUE value. # @@ -4304,7 +5644,7 @@ groups: # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue expr: |2 - sum by (job) (1 * (label_replace(ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_2"}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) + sum by (job) (1 * sum without (chassis) (ceems_redfish_power_current_watts{job="cpu-redfish-nvidia-gpu",chassis="Chassis_1"})) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. 
@@ -4346,7 +5686,7 @@ groups: # Ref: https://github.com/NVIDIA/DCGM/issues/80#issuecomment-1537603016 - record: uuid:ceems_gpu_usage:ratio expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} @@ -4354,16 +5694,16 @@ groups: - record: uuid:ceems_gpu_memory_usage:ratio expr: |2 ( - DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 / - (DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"} + DCGM_FI_DEV_FB_FREE{job="ipmi-nvidia-gpu"}) + (label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + label_replace(label_replace(DCGM_FI_DEV_FB_FREE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)")) ) * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. - record: dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue - expr: 1 * DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"} + expr: 1 * label_replace(label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") # When profiling metrics are available, we split the total power consumed by physical # GPU among all MIG instances based on "effective" SM usage on each MIG instance. 
This @@ -4376,10 +5716,10 @@ groups: ( ( ( - DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-ipmi-nvidia-gpu"} ) ) @@ -4387,10 +5727,10 @@ groups: ( sum by (gpuuuid) ( ( - DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-ipmi-nvidia-gpu"} ) > @@ -4398,9 +5738,9 @@ groups: ) ) ) - * on (gpuuuid) group_right () + * on (gpuuuid,gpuiid) group_left () dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="ipmi-nvidia-gpu"} - * on (gpuuuid, gpuiid) group_right () + * on (gpuuuid,gpuiid,uuid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit's GPUs. 
@@ -4409,9 +5749,7 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="ipmi-nvidia-gpu"} / 3.6e+06 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"}, + uuid:ceems_gpu_power_watts:pue{job="ipmi-nvidia-gpu"} / 3.6e+06, "provider", "custom", "instance", @@ -4421,21 +5759,75 @@ groups: # Profiling metrics - - record: uuid:ceems_gpu_sm_active:ratio + - record: uuid:ceems_gpu_prof_sm_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_sm_occupancy:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_gr_engine_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_tensor_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp64_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP64_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 
+ * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp32_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP32_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp16_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP16_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_dram_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_DRAM_ACTIVE{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_tx_bytes:rate + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_TX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_rx_bytes:rate expr: |2 - DCGM_FI_PROF_SM_ACTIVE{job="ipmi-nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_RX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} - - record: uuid:ceems_gpu_sm_occupancy:ratio + - record: uuid:ceems_gpu_prof_pcie_tx_bytes:rate expr: |2 - DCGM_FI_PROF_SM_OCCUPANCY{job="ipmi-nvidia-gpu"} * 100 + 
label_replace(label_replace(DCGM_FI_PROF_PCIE_TX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} - - record: uuid:ceems_gpu_gr_engine_active:ratio + - record: uuid:ceems_gpu_prof_pcie_rx_bytes:rate expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="ipmi-nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_PCIE_RX_BYTES{job="ipmi-nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-ipmi-nvidia-gpu"} @@ -4506,7 +5898,7 @@ groups: # Ref: https://github.com/NVIDIA/DCGM/issues/80#issuecomment-1537603016 - record: uuid:ceems_gpu_usage:ratio expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} @@ -4514,16 +5906,16 @@ groups: - record: uuid:ceems_gpu_memory_usage:ratio expr: |2 ( - DCGM_FI_DEV_FB_USED{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 / - (DCGM_FI_DEV_FB_USED{job="nvidia-gpu"} + DCGM_FI_DEV_FB_FREE{job="nvidia-gpu"}) + (label_replace(label_replace(DCGM_FI_DEV_FB_USED{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + label_replace(label_replace(DCGM_FI_DEV_FB_FREE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)")) ) * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. 
- record: dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue - expr: 1 * DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"} + expr: 1 * label_replace(label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") # When profiling metrics are available, we split the total power consumed by physical # GPU among all MIG instances based on "effective" SM usage on each MIG instance. This @@ -4536,10 +5928,10 @@ groups: ( ( ( - DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-redfish-nvidia-gpu"} ) ) @@ -4547,10 +5939,10 @@ groups: ( sum by (gpuuuid) ( ( - DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"} + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * on (gpuuuid, gpuiid) - DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"} - * on (gpuuuid, gpuiid) + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") + * on (gpuuuid, gpuiid) group_right () ceems_compute_unit_gpu_sm_count{job="cpu-redfish-nvidia-gpu"} ) > @@ -4558,9 +5950,9 @@ groups: ) ) ) - * on (gpuuuid) group_right () + * on (gpuuuid,gpuiid) group_left () dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="nvidia-gpu"} - * on (gpuuuid, gpuiid) group_right () + * on (gpuuuid,gpuiid,uuid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit's 
GPUs. @@ -4569,9 +5961,7 @@ groups: - record: uuid:ceems_gpu_emissions_g_s:pue expr: |2 label_replace( - dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue{job="nvidia-gpu"} / 3.6e+06 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"}, + uuid:ceems_gpu_power_watts:pue{job="nvidia-gpu"} / 3.6e+06, "provider", "custom", "instance", @@ -4581,21 +5971,75 @@ groups: # Profiling metrics - - record: uuid:ceems_gpu_sm_active:ratio + - record: uuid:ceems_gpu_prof_sm_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_sm_occupancy:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_gr_engine_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_tensor_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp64_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP64_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on 
(gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp32_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP32_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_pipe_fp16_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_PIPE_FP16_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_dram_active:ratio + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_DRAM_ACTIVE{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 100 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_tx_bytes:rate + expr: |2 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_TX_BYTES{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 + * on (gpuuuid,gpuiid) group_right () + ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} + + - record: uuid:ceems_gpu_prof_nvlink_rx_bytes:rate expr: |2 - DCGM_FI_PROF_SM_ACTIVE{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_NVLINK_RX_BYTES{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} - - record: uuid:ceems_gpu_sm_occupancy:ratio + - record: uuid:ceems_gpu_prof_pcie_tx_bytes:rate expr: |2 - DCGM_FI_PROF_SM_OCCUPANCY{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_PCIE_TX_BYTES{job="nvidia-gpu"}, 
"gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} - - record: uuid:ceems_gpu_gr_engine_active:ratio + - record: uuid:ceems_gpu_prof_pcie_rx_bytes:rate expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job="nvidia-gpu"} * 100 + label_replace(label_replace(DCGM_FI_PROF_PCIE_RX_BYTES{job="nvidia-gpu"}, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)") * 1 * on (gpuuuid,gpuiid) group_right () ceems_compute_unit_gpu_index_flag{job="cpu-redfish-nvidia-gpu"} @@ -4658,25 +6102,31 @@ queries: avg_gpu_mem_usage: + global: avg_over_time(avg by (uuid) (uuid:ceems_gpu_memory_usage:ratio{uuid=~"{{.UUIDs}}"} > 0 < inf)[{{.Range}}:]) avg_gpu_usage: + global: avg_over_time(avg by (uuid) (uuid:ceems_gpu_usage:ratio{uuid=~"{{.UUIDs}}"} > 0 < inf)[{{.Range}}:]) total_cpu_emissions_gms: + owid_total: sum_over_time(sum by (uuid) (uuid:ceems_host_emissions_g_s:pue{uuid=~"{{.UUIDs}}",provider="owid"} > 0 < inf)[{{.Range}}:{{.ScrapeInterval}}]) * {{.ScrapeIntervalMilli}} / 1e3 total_cpu_energy_usage_kwh: + total: sum_over_time(sum by (uuid) (uuid:ceems_host_power_watts:pue{uuid=~"{{.UUIDs}}"} > 0 < inf)[{{.Range}}:{{.ScrapeInterval}}]) * {{.ScrapeIntervalMilli}} / 3.6e9 total_gpu_emissions_gms: + owid_total: sum_over_time(sum by (uuid) (uuid:ceems_gpu_emissions_g_s:pue{uuid=~"{{.UUIDs}}",provider="owid"} > 0 < inf)[{{.Range}}:{{.ScrapeInterval}}]) * {{.ScrapeIntervalMilli}} / 1e3 total_gpu_energy_usage_kwh: + total: sum_over_time(sum by (uuid) (uuid:ceems_gpu_power_watts:pue{uuid=~"{{.UUIDs}}"} > 0 < inf)[{{.Range}}:{{.ScrapeInterval}}]) * {{.ScrapeIntervalMilli}} / 3.6e9 diff --git a/cmd/ceems_tool/testdata/output/e2e-test-relabel-config-output.txt b/cmd/ceems_tool/testdata/output/e2e-test-relabel-config-output.txt index 82a19185..eeaa64a3 100644 --- a/cmd/ceems_tool/testdata/output/e2e-test-relabel-config-output.txt +++ 
b/cmd/ceems_tool/testdata/output/e2e-test-relabel-config-output.txt @@ -40,6 +40,24 @@ scrape_configs: action: labeldrop - regex: GPU_I_ID action: labeldrop + - job: amd-device-metrics-gpu + metric_relabel_configs: + - source_labels: + - gpu_id + target_label: index + regex: (.*) + replacement: $1 + action: replace + - source_labels: + - gpu_partition_id + target_label: gpuiid + regex: (.*) + replacement: $1 + action: replace + - regex: gpu_id + action: labeldrop + - regex: gpu_partition_id + action: labeldrop - job: amd-smi-gpu metric_relabel_configs: - source_labels: @@ -68,22 +86,7 @@ scrape_configs: action: labeldrop - regex: gpu_memory_use_percent action: labeldrop - - job: amd-device-metrics-gpu - metric_relabel_configs: - - source_labels: - - gpu_id - target_label: index - regex: (.*) - replacement: $1 - action: replace - - source_labels: - - gpu_partition_id - target_label: gpuiid - regex: (.*) - replacement: $1 - action: replace - - regex: gpu_id - action: labeldrop - - regex: gpu_partition_id - action: labeldrop +WARNING: +Starting from v0.11.0 the relabelling of metrics is handled by recording rules generated by ceems_tool +There is no need to add the generated relabel_configs to Prometheus' scrape configs diff --git a/cmd/ceems_tool/testdata/output/e2e-test-web-config-output.txt b/cmd/ceems_tool/testdata/output/e2e-test-web-config-output.txt index 1d0bf760..5fb2a2d3 100644 --- a/cmd/ceems_tool/testdata/output/e2e-test-web-config-output.txt +++ b/cmd/ceems_tool/testdata/output/e2e-test-web-config-output.txt @@ -25,26 +25,9 @@ ceems_meminfo_MemFree_bytes{hostname=""} 4.50891776e+08 # HELP ceems_meminfo_MemTotal_bytes Memory information field MemTotal_bytes. 
# TYPE ceems_meminfo_MemTotal_bytes gauge ceems_meminfo_MemTotal_bytes{hostname=""} 1.6042172416e+10 -# HELP ceems_rapl_dram_joules_total Current RAPL dram value in joules -# TYPE ceems_rapl_dram_joules_total counter -ceems_rapl_dram_joules_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0:0"} 24468.409791 -ceems_rapl_dram_joules_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1:0"} 64930.394974 -# HELP ceems_rapl_dram_power_limit_watts_total Current RAPL dram power limit in watts -# TYPE ceems_rapl_dram_power_limit_watts_total counter -ceems_rapl_dram_power_limit_watts_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0:0"} 0 -ceems_rapl_dram_power_limit_watts_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1:0"} 0 -# HELP ceems_rapl_package_joules_total Current RAPL package value in joules -# TYPE ceems_rapl_package_joules_total counter -ceems_rapl_package_joules_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 258218.293244 -ceems_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 130570.505826 -# HELP ceems_rapl_package_power_limit_watts_total Current RAPL package power limit in watts -# TYPE ceems_rapl_package_power_limit_watts_total counter -ceems_rapl_package_power_limit_watts_total{hostname="",index="0",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:0"} 180 -ceems_rapl_package_power_limit_watts_total{hostname="",index="1",path="pkg/collector/testdata/sys/class/powercap/intel-rapl:1"} 180 # HELP ceems_scrape_collector_duration_seconds ceems_exporter: Duration of a collector scrape. # TYPE ceems_scrape_collector_duration_seconds gauge # HELP ceems_scrape_collector_success ceems_exporter: Whether a collector succeeded. 
# TYPE ceems_scrape_collector_success gauge ceems_scrape_collector_success{collector="cpu"} 1 ceems_scrape_collector_success{collector="meminfo"} 1 -ceems_scrape_collector_success{collector="rapl"} 1 diff --git a/cmd/ceems_tool/testdata/prometheus.yml b/cmd/ceems_tool/testdata/prometheus.yml index 6a17f26b..a804a908 100644 --- a/cmd/ceems_tool/testdata/prometheus.yml +++ b/cmd/ceems_tool/testdata/prometheus.yml @@ -4,6 +4,10 @@ global: scrape_interval: 500ms evaluation_interval: 1s + +rule_files: + - ../../../etc/prometheus/rules/*.rules + # scrape configuration scrape_configs: # Only CPU hosts with IPMI available @@ -12,6 +16,7 @@ scrape_configs: - targets: ["localhost:9010"] labels: instance: compute-ipmi-0:9010 + hostname: compute-ipmi-0 # Only CPU hosts whith only RAPL available - job_name: cpu-only-rapl @@ -19,6 +24,7 @@ scrape_configs: - targets: ["localhost:9011"] labels: instance: compute-rapl-0:9011 + hostname: compute-rapl-0 # Only CPU hosts with Redfish available with ONE CHASSIS - job_name: cpu-only-redfish @@ -30,6 +36,7 @@ scrape_configs: - targets: ["localhost:9012"] labels: instance: compute-redfish-0:9012 + hostname: compute-redfish-0 # CPU and NVIDIA GPU hosts with Redfish available with TWO CHASSIS - job_name: cpu-redfish-nvidia-gpu @@ -37,6 +44,7 @@ scrape_configs: - targets: ["localhost:9013"] labels: instance: compute-redfish-0:9013 + hostname: compute-redfish-0 # NVIDIA DCGM job for above hosts - job_name: nvidia-gpu @@ -44,6 +52,7 @@ scrape_configs: - targets: ["localhost:9400"] labels: instance: compute-redfish-0:9400 + Hostname: compute-redfish-0 # CPU and AMD GPUs with Cray available - job_name: cpu-cray-amd-gpu @@ -51,6 +60,7 @@ scrape_configs: - targets: ["localhost:9014"] labels: instance: compute-cray-0:9014 + hostname: compute-cray-0 # AMD SMI job for above hosts - job_name: amd-smi-gpu @@ -59,19 +69,29 @@ scrape_configs: labels: instance: compute-cray-0:9500 + # CPU and AMD GPUs with Hwmon available + - job_name: cpu-hwmon-amd-gpu + 
static_configs: + - targets: ["localhost:9015"] + labels: + instance: compute-hwmon-0:9015 + hostname: compute-hwmon-0 + # AMD Device metrics job for above hosts - job_name: amd-device-metrics-gpu static_configs: - targets: ["localhost:9600"] labels: - instance: compute-cray-0:9600 + instance: compute-hwmon-0:9600 + hostname: compute-hwmon-0 # CPU and NVIDIA GPU hosts with IPMI available and including GPU power - job_name: cpu-ipmi-nvidia-gpu static_configs: - - targets: ["localhost:9015"] + - targets: ["localhost:9016"] labels: - instance: compute-ipmi-nvidia-0:9015 + instance: compute-ipmi-nvidia-0:9016 + hostname: compute-ipmi-nvidia-0 # NVIDIA DCGM job for above hosts - job_name: ipmi-nvidia-gpu @@ -79,10 +99,12 @@ scrape_configs: - targets: ["localhost:9400"] labels: instance: compute-ipmi-nvidia-0:9400 + Hostname: compute-ipmi-nvidia-0 # Emissions job - job_name: emissions static_configs: - - targets: ["localhost:9016"] + - targets: ["localhost:9017"] labels: - instance: compute-emissions:9016 + instance: compute-emissions:9017 + hostname: compute-emissions diff --git a/etc/prometheus/README.md b/etc/prometheus/README.md index a0e8191b..d95af9dc 100644 --- a/etc/prometheus/README.md +++ b/etc/prometheus/README.md @@ -18,56 +18,66 @@ found in [docs](https://ceems-dev.github.io/ceems/docs/usage/ceems-tool). The following recording rules files are provided for reference purposes and `ceems_tool` must be preferred to generate recording rules. -### `host-power-ipmi.rules` +### [`host-usage.rules`](./rules/host-usage.rules) -The rules defined in this file are meant to be used for group of nodes that do not -have any GPUs and IPMI DCMI is capable of reporting host power. The rules make the -following assumptions: +The rules in this file estimate the host CPU and CPU memory usage for each +compute unit and also average usage aggregated over Prometheus jobs. 
-- Total host power is reported by IPMI DCMI -- RAPL counters are available for both CPU and DRAM packages - -The provided rules estimate the power usage of individual compute units based on -compute unit CPU and DRAM usage and total node's CPU and DRAM usage. More details -are provided in the comments of the rules file. +### [`host-power-cray-pmc.rules`](./rules/host-power-cray-pmc.rules) -### `host-power-cray-pmc.rules` - -The rules defined in this file are meant to be used for group of Cray nodes where -PMC counters are used to get host power usage. The rules make the following -assumptions: +The rules defined in this file estimate host power usage for the nodes where Cray +PM counters are available. The rules make the following assumptions: - Total host power is reported by Cray PM counters. -The provided rules estimate the power usage of individual compute units based on -compute unit CPU and DRAM usage and total node's CPU and DRAM usage. More details -are provided in the comments of the rules file. +The provided rules estimate the power usage of individual compute units and total +power usage of nodes aggregated over Prometheus jobs. -### `host-power-redfish.rules` +### [`host-power-redfish.rules`](./rules/host-power-redfish.rules) -The rules defined in this file are meant to be used for group of nodes that use -Redfish to report host power. The rules make the following assumptions: +The rules defined in this file estimate host power usage for the nodes where Redfish +reports power usage. The rules make the following assumptions: - Total host power is reported by Redfish. Chassis that reports host power usage must be used. -- RAPL counters are available for both CPU and DRAM packages -The provided rules estimate the power usage of individual compute units based on -compute unit CPU and DRAM usage and total node's CPU and DRAM usage. More details -are provided in the comments of the rules file. 
+The provided rules estimate the power usage of individual compute units and total +power usage of nodes aggregated over Prometheus jobs. + +### [`host-power-hwmon.rules`](./rules/host-power-hwmon.rules) + +The rules defined in this file estimate host power usage for the nodes where HWMon +counters report power usage. The rules make the following assumptions: + +- Total host power is reported by HWMon counters. Chip that reports host power usage +must be used. -### `host-power-rapl.rules` +The provided rules estimate the power usage of individual compute units and total +power usage of nodes aggregated over Prometheus jobs. -The rules defined in this file are meant to be used for group of nodes that uses +### [`host-power-ipmi.rules`](./rules/host-power-ipmi.rules) + +The rules defined in this file estimate host power usage for the nodes where in-band +IPMI DCMI reports power usage. The rules make the following assumptions: + +- Total host power is reported by in-band IPMI DCMI. If power usage reported by +IPMI DCMI includes GPU power usage (if present on the node), appropriate rule +must be selected from the rule file to exclude GPU power usage + +The provided rules estimate the power usage of individual compute units and total +power usage of nodes aggregated over Prometheus jobs. + +### [`host-power-rapl.rules`](./rules/host-power-rapl.rules) + +The rules defined in this file estimate host power usage for the nodes that exposes only RAPL counters to get host power usage. The rules make the following assumptions: - RAPL counters are available for both CPU and DRAM packages -The provided rules estimate the power usage of individual compute units based on -compute unit CPU and DRAM usage and total node's CPU and DRAM usage. More details -are provided in the comments of the rules file. +The provided rules estimate the power usage of individual compute units and total +power usage of nodes aggregated over Prometheus jobs. 
-### `host-power-ipmi-with-nvidia-gpus.rules` + + +### [`nvidia-dcgm-gpu.rules`](./rules/nvidia-dcgm-gpu.rules) + +The rules defined in this file estimate GPU usage, power usage and profiling metrics +for the nodes that have NVIDIA GPUs monitored by +[NVIDIA DCGM exporter](https://github.com/NVIDIA/dcgm-exporter). + +The provided rules estimate different GPU metrics of individual compute units and sum/average +over all nodes aggregated over Prometheus jobs. + +### [`amd-device-metrics-gpu.rules`](./rules/amd-device-metrics-gpu.rules) + +The rules defined in this file estimate GPU usage, power usage and profiling metrics +for the nodes that have AMD GPUs monitored by +[AMD Device metrics exporter](https://github.com/ROCm/device-metrics-exporter). -### `nvidia-gpu.rules` +The provided rules estimate different GPU metrics of individual compute units and sum/average +over all nodes aggregated over Prometheus jobs. -The rules defined in this file are meant to be used for group of nodes that have -NVIDIA GPUs. The rules compute few derived metrics from metrics reported -by [NVIDIA DCGM exporter](https://github.com/NVIDIA/dcgm-exporter) that are -relevant to monitor overall cluster status. +### [`amd-smi-gpu.rules`](./rules/amd-smi-gpu.rules) -### `amd-gpu.rules` +The rules defined in this file estimate GPU usage and power usage +for the nodes that have AMD GPUs monitored by +[AMD SMI exporter](https://github.com/amd/amd_smi_exporter). -The rules defined in this file are meant to be used for group of nodes that have -AMD GPUs. The rules compute few derived metrics from metrics reported -by [AMD SMI exporter](https://github.com/amd/amd_smi_exporter) that are -relevant to monitor overall cluster status. +The provided rules estimate different GPU metrics of individual compute units and sum/average +over all nodes aggregated over Prometheus jobs. 
## Installing rules @@ -151,7 +174,7 @@ sed -e 's///g' -e 's////g' gpu.rules > cluster_rules/dcgm-a100-partition-1.rules ``` --> -After generating rules using `ceems_tool` or after replacing placeholders in the references rule files +After generating rules using `ceems_tool` or appropriately modifying the references rule files provided in this repository, we need to make sure they are valid. This can be done using [`promtool`](https://prometheus.io/docs/prometheus/latest/command-line/promtool/). Assuming generated rule files are placed in `myrules` folder: diff --git a/etc/prometheus/rules/amd-device-metrics-gpu.rules b/etc/prometheus/rules/amd-device-metrics-gpu.rules new file mode 100644 index 00000000..08cc2077 --- /dev/null +++ b/etc/prometheus/rules/amd-device-metrics-gpu.rules @@ -0,0 +1,230 @@ +--- +# Recording rules for AMD GPUs using device metrics exporter. +# These rules are generated assuming the prefix `amd_` used for +# devices metrics exporter config. +# +# These rules provide AMD GPU metrics fetched from +# device metrics exporter (https://instinct.docs.amd.com/projects/device-metrics-exporter/en/latest/index.html) +# for each compute unit. +# +# We leverage these rules to include PUE (Power Usage Effectiveness) in the Power +# estimation as well. +# +# Optional placeholders to replace: +# +# : PUE value +# : Evaluation interval +# +# By default emissions are estimated using OWID for France. In order to change +# them replace `owid` with appropriate emissions provider supported by CEEMS exporter +# and replace `country_code` with appropriate country code. More details in CEEMS +# exporter docs (https://ceems-dev.github.io/ceems/docs/components/ceems-exporter#emissions-collector). 
+# +groups: + - name: amd-dev-exporter-gpu-usage-rules + # interval: + rules: + # GPU Usage (%) by compute unit + - record: uuid:ceems_gpu_usage:ratio + expr: |2 + label_replace(label_replace(amd_gpu_gfx_activity, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + # GPU Memory Usage (%) by compute unit + - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: vram + expr: |2 + ( + label_replace(label_replace(amd_gpu_used_vram, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") * 100 + / + label_replace(label_replace(amd_gpu_total_vram, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: gtt + expr: |2 + ( + label_replace(label_replace(amd_gpu_used_gtt, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") * 100 + / + label_replace(label_replace(amd_gpu_total_gtt, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_memory_usage:ratio + labels: + type: visiblevram + expr: |2 + ( + label_replace(label_replace(amd_gpu_used_visible_vram, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") * 100 + / + label_replace(label_replace(amd_gpu_total_visible_vram, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") > 0 + ) + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + # Average GPU Usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_usage:avg + expr: avg by (job) (amd_gpu_gfx_activity) + + # Average GPU 
memory usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: vram + expr: avg by (job) ((amd_gpu_used_vram * 100 / amd_gpu_total_vram > 0)) + + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: gtt + expr: avg by (job) ((amd_gpu_used_gtt * 100 / amd_gpu_total_gtt > 0)) + + - record: job:ceems_gpu_memory_usage:avg_ratio + labels: + type: visiblevram + expr: avg by (job) ((amd_gpu_used_visible_vram * 100 / amd_gpu_total_visible_vram > 0)) + + - name: amd-dev-exporter-gpu-power-usage-rules + # interval: + rules: + # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. + - record: dev:gpu_power_usage_watts:pue + labels: + ceemspowersources: amd-dev-metrics + expr: label_replace(label_replace(amd_gpu_package_power, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") # * + + # For AMD device metrics exporter, gpu_partition 0 is the one that has non zero power usage and + rest of partitions will report zero. So, we need to always consider partition 0 to get real usage + of GPU. In the rule below we always select partition 0 (`gpuiid="0"`) to get the real GPU power usage. + - record: uuid:ceems_gpu_power_watts:pue + labels: + ceemspowersources: amd-dev-metrics + expr: |2 + ( + ceems_compute_unit_gpu_sm_count + / on (gpuuuid,hostname) group_left () + (sum by (gpuuuid,hostname) (ceems_compute_unit_gpu_sm_count) > 0) + ) + * on (gpuuuid,hostname) group_left() + dev:gpu_power_usage_watts:pue{gpuiid="0"} + * on (gpuuuid,gpuiid,hostname,uuid) group_right() + ceems_compute_unit_gpu_index_flag + + # Total equivalent emissions rate (g/s) from GPU due to the power consumed by the compute unit's GPUs.
+ # The equivalent emissions are estimated using emission factor from owid for country FR + - record: uuid:ceems_gpu_emissions_g_s:pue + expr: |2 + ( + label_replace(uuid:ceems_gpu_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "amd-dev-metrics", + "instance", + "(.*)" + ) + ) + + # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job + - record: job:ceems_gpu_power_watts:pue + labels: + ceemspowersources: amd-dev-metrics + expr: sum by (job)(dev:gpu_power_usage_watts:pue) + + # Total equivalent emissions rate (g/s) due to the power consumed by GPUs on all the hosts + # in a Prometheus job accounting PUE value. + - record: job:ceems_gpu_emissions_g_s:pue + expr: |2 + ( + label_replace(job:ceems_gpu_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "amd-dev-metrics", + "instance", + "(.*)" + ) + ) + + # Profiling metrics + - name: amd-dev-exporter-gpu-profiling-rules + # interval: + rules: + - record: uuid:ceems_gpu_prof_sm_active:ratio + expr: |2 + label_replace(label_replace(amd_gpu_prof_sm_active, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_tensor_active_percent:ratio + expr: |2 + label_replace(label_replace(amd_gpu_prof_tensor_active_percent, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_occupancy_percent:ratio + expr: |2 + label_replace(label_replace(amd_gpu_prof_occupancy_percent, "gpuuuid", "$1", 
"serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_total_16_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_16_ops, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_total_32_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_32_ops, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_total_64_ops:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_total_64_ops, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_write_size:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_write_size, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_fetch_size:sum + expr: |2 + label_replace(label_replace(amd_gpu_prof_fetch_size, "gpuuuid", "$1", "serial_number", "(.*)"), "gpuiid", "$1", "gpu_partition_id", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + # Average usage over all hosts aggregated by Prometheus job + - record: job:ceems_gpu_prof_sm_active:ratio + expr: avg by (job) (amd_gpu_prof_sm_active) + + - record: job:ceems_gpu_prof_tensor_active_percent:ratio + expr: avg by (job) (amd_gpu_prof_tensor_active_percent) + + - record: job:ceems_gpu_prof_occupancy_percent:ratio + expr: avg by (job) (amd_gpu_prof_occupancy_percent) + + - record: 
job:ceems_gpu_prof_total_16_ops:sum + expr: avg by (job) (amd_gpu_prof_total_16_ops) + + - record: job:ceems_gpu_prof_total_32_ops:sum + expr: avg by (job) (amd_gpu_prof_total_32_ops) + + - record: job:ceems_gpu_prof_total_64_ops:sum + expr: avg by (job) (amd_gpu_prof_total_64_ops) + + - record: job:ceems_gpu_prof_write_size:sum + expr: avg by (job) (amd_gpu_prof_write_size) + + - record: job:ceems_gpu_prof_fetch_size:sum + expr: avg by (job) (amd_gpu_prof_fetch_size) diff --git a/etc/prometheus/rules/amd-gpu.rules b/etc/prometheus/rules/amd-gpu.rules deleted file mode 100644 index b2aac048..00000000 --- a/etc/prometheus/rules/amd-gpu.rules +++ /dev/null @@ -1,100 +0,0 @@ ---- -# Recording rules for AMD GPUs scrape job . -# -# These rules map the GPU usage to the compute unit `uuid` which gives -# GPU metrics for each compute unit. -# -# We leverage these rules to include PUE (Power Usage Effectiveness) in the Power -# estimation as well. -# -# Placeholders to replace: -# : Prometheus job name -# : Prometheus job name under which DCGM exporter is running on the same host -# : Evaluation interval -# : Rate interval -# -groups: - - name: compute-unit-gpu-rules- - interval: - rules: - # GPU Usage (%) by compute unit - - record: uuid:ceems_gpu_usage:ratio - expr: |2 - amd_gpu_use_percent{job=""} - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job=""} - - # GPU Memory Usage (%) by compute unit - - record: uuid:ceems_gpu_memory_usage:ratio - expr: |2 - amd_gpu_memory_use_percent{job=""} - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job=""} - - # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. 
- # AMD GPU power is in micro Watts and we need to convert it to Watts here - - record: dev:amg_gpu_power_watts:pue - expr: 1 * amg_gpu_power{job=""} / 1e6 - - - record: uuid:ceems_gpu_power_watts:pue - expr: |2 - dev:amg_gpu_power_watts:pue{job=""} - * on (index) group_right() - ceems_compute_unit_gpu_index_flag{job=""} - - # Total equivalent emissions rate (g/s) from GPU due to the power consumed by the compute unit's GPUs. - # The equivalent emissions are estimated using emission factor from owid for country - # FR - - record: uuid:ceems_gpu_emissions_g_s:pue - expr: |2 - label_replace( - dev:amg_gpu_power_watts:pue{job=""} / 3.6e+06 - * on (index) group_right () - ceems_compute_unit_gpu_index_flag{job=""}, - "provider", - "owid", - "instance", - "(.*)" - ) - * on (provider) group_left () - label_replace( - ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, - "common_label", - "mock", - "instance", - "(.*)" - ) - - - # The following recording rules estimate the average GPU, GPU memory usages and - # total GPU power and its equivalent emissions aggregared for all hosts - # per Prometheus job. - # - - name: host-agg-gpu-rules- - interval: - rules: - # Average GPU Usage (%) for all hosts aggregated per Prometheus job - - record: job:ceems_gpu_usage:avg - expr: avg by (job) (amd_gpu_use_percent{job=""}) - - # Average GPU memory usage (%) for all hosts aggregated per Prometheus job - - record: job:ceems_gpu_memory_usage:avg_ratio - expr: avg by (job) (amd_gpu_memory_use_percent{job=""}) - - # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job - # AMD GPU power is in micro Watts and we need to convert it to Watts here - - record: job:ceems_gpu_power_watts:pue - expr: sum by (job)(1 * amg_gpu_power{job=""} / 1e6) - - # Total equivalent emissions rate (g/s) due to the power consumed by GPUs on all ths hosts - # in a Prometheus job accounting PUE value. 
- # The equivalent emissions are estimated for country FR - - record: job:ceems_gpu_emissions_g_s:pue - expr: |2 - sum by (job, country_code, country, provider) ( - ( - job:ceems_gpu_power_watts:pue{job=""} / 3.6e+06 - * on (job) group_right () - label_replace(ceems_emissions_gCo2_kWh, "job", "", "instance", "(.*)") - ) - ) diff --git a/etc/prometheus/rules/amd-smi-gpu.rules b/etc/prometheus/rules/amd-smi-gpu.rules new file mode 100644 index 00000000..96eb7701 --- /dev/null +++ b/etc/prometheus/rules/amd-smi-gpu.rules @@ -0,0 +1,108 @@ +--- +# Recording rules for AMD GPUs using SMI exporter. +# +# These rules provide AMD GPU metrics fetched from +# AMD SMI exporter (https://www.amd.com/en/developer/e-sms/amdsmi-library.html) +# for each compute unit. +# +# We leverage these rules to include PUE (Power Usage Effectiveness) in the Power +# estimation as well. +# +# Optional placeholders to replace: +# +# : PUE value +# : Evaluation interval +# +# By default emissions are estimated using OWID for France. In order to change +# them replace `owid` with appropriate emissions provider supported by CEEMS exporter +# and replace `country_code` with appropriate country code. More details in CEEMS +# exporter docs (https://ceems-dev.github.io/ceems/docs/components/ceems-exporter#emissions-collector). 
+# +groups: + - name: amd-smi-exporter-gpu-usage-rules + # interval: + rules: + # GPU Usage (%) by compute unit + - record: uuid:ceems_gpu_usage:ratio + expr: |2 + label_replace(label_replace(amd_gpu_use_percent, "index", "$1", "gpu_use_percent", "(.*)"), "hostname", "$1", "instance", "([^:]+):\\d+") + * on (index,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + # GPU Memory Usage (%) by compute unit + - record: uuid:ceems_gpu_memory_usage:ratio + expr: |2 + label_replace(label_replace(amd_gpu_memory_use_percent, "index", "$1", "gpu_memory_use_percent", "(.*)"), "hostname", "$1", "instance", "([^:]+):\\d+") + * on (index,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + # Average GPU Usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_usage:avg + expr: avg by (job) (amd_gpu_use_percent) + + # Average GPU memory usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_memory_usage:avg_ratio + expr: avg by (job) (amd_gpu_memory_use_percent) + + - name: amd-smi-exporter-gpu-power-usage-rules + # interval: + rules: + # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. + # AMD GPU power is in micro Watts and we need to convert it to Watts here + - record: dev:amd_gpu_power_watts:pue + labels: + ceemspowersources: amd-smi + expr: (label_replace(label_replace(amd_gpu_power, "index", "$1", "gpu_power", "(.*)"), "hostname", "$1", "instance", "([^:]+):\\d+") / 1e6) # * + + - record: uuid:ceems_gpu_power_watts:pue + labels: + ceemspowersources: amd-smi + expr: |2 + ( + ceems_compute_unit_gpu_sm_count + / on (index,hostname) group_left () + (sum by (index,hostname) (ceems_compute_unit_gpu_sm_count) > 0) + ) + * on (index,hostname) group_left() + dev:amd_gpu_power_watts:pue + * on (index,hostname) group_right() + ceems_compute_unit_gpu_index_flag + + # Total equivalent emissions rate (g/s) from GPU due to the power consumed by the compute unit's GPUs. 
+ # The equivalent emissions are estimated using emission factor from owid for country FR + - record: uuid:ceems_gpu_emissions_g_s:pue + expr: |2 + ( + label_replace(uuid:ceems_gpu_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "amd-smi", + "instance", + "(.*)" + ) + ) + + # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job + # AMD GPU power is in micro Watts and we need to convert it to Watts here + - record: job:ceems_gpu_power_watts:pue + labels: + ceemspowersources: amd-smi + expr: sum by (job)(dev:amd_gpu_power_watts:pue) + + # Total equivalent emissions rate (g/s) due to the power consumed by GPUs on all the hosts + # in a Prometheus job accounting PUE value. + - record: job:ceems_gpu_emissions_g_s:pue + expr: |2 + ( + label_replace(job:ceems_gpu_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "amd-smi", + "instance", + "(.*)" + ) + ) diff --git a/etc/prometheus/rules/host-power-cray-pmc.rules b/etc/prometheus/rules/host-power-cray-pmc.rules index acb98964..31a8afd0 100644 --- a/etc/prometheus/rules/host-power-cray-pmc.rules +++ b/etc/prometheus/rules/host-power-cray-pmc.rules @@ -1,45 +1,31 @@ --- -# Recording rules for scrape job +# Recording rules for compute unit host power usage reported by Cray PM counters. # -# The following recording rules provide several CPU related metrics of the individual -# compute units. Each of these metrics involving multiple raw metrics to compute them. -# Performing such queries involving multiple metrics is a computational intensive -# operation for Prometheus and hence, we leverage recording rules to estimate them -# in the real time and store them in the TSDB. 
The downside of this approach is that -# it creates new metrics which consume more space. However, we add atmost 10 new metrics -# which should not increase the disk use of TSDB enormously. +# The following rules estimate the power usage of compute units and their equivalent +# emissions when Cray PM counters are available on the nodes. # -# Placeholders to replace: -# : Prometheus job name +# Optional placeholders to replace: +# +# : PUE value # : Evaluation interval -# : Rate interval +# +# By default rate interval of 1m is used. For scrape intervals more than 30s, use a bigger +# rate interval. +# +# By default emissions are estimated using OWID for France. In order to change +# them replace `owid` with appropriate emissions provider supported by CEEMS exporter +# and replace `country_code` with appropriate country code. More details in CEEMS +# exporter docs (https://ceems-dev.github.io/ceems/docs/components/ceems-exporter#emissions-collector). # groups: - - name: compute-unit-rules- - interval: + - name: cray-power-usage-rules + # interval: rules: - # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. - - record: uuid:ceems_cpu_usage:ratio_irate - expr: |2 - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) * 100 - / - (ceems_compute_unit_cpus{job=""} > 0) - - # CPU memory usage (%) of compute unit. It is percentage of CPU memory used by compute unit relative to - # the available memory to the compute unit. - - record: uuid:ceems_cpu_memory_usage:ratio - expr: |2 - ceems_compute_unit_memory_used_bytes{job=""} * 100 - / - (ceems_compute_unit_memory_total_bytes{job=""} > 0) - # Total host power (Watts) consumed by the instance by accouting Power Usage Effectiveness (PUE) value. 
- record: instance:ceems_cray_pm_counters_power_watts:pue - expr: 1 * ceems_cray_pm_counters_power_watts{job=""} + labels: + ceemspowersources: cray-pmc + expr: ceems_cray_pm_counters_power_watts # * # Total host power (Watts) consumed by the compute unit accouting PUE value. # @@ -55,126 +41,97 @@ groups: # This misc power is split equally among all running compute units at a given time. # - record: uuid:ceems_host_power_watts:pue + labels: + ceemspowersources: cray-pmc expr: |2 - instance:ceems_cray_pm_counters_power_watts:pue{domain="cpu",job=""} # CPU Power Usage by Compute Unit - * on (instance) group_right () - ( - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) - / on (instance) group_left () - sum by (instance) (irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[])) - ) - + - instance:ceems_cray_pm_counters_power_watts:pue{domain="memory",job=""} # CPU Memory Power Usage by Compute Unit - * on (instance) group_right () - ( - ceems_compute_unit_memory_used_bytes{job=""} - / on (instance) group_left () + instance:ceems_cray_pm_counters_power_watts:pue{domain="cpu"} + * on (instance) group_right () + ( ( - ceems_meminfo_MemTotal_bytes{job=""} - - on (instance) - ceems_meminfo_MemAvailable_bytes{job=""} + ( + irate(ceems_compute_unit_cpu_user_seconds_total[1m]) + + + irate(ceems_compute_unit_cpu_system_seconds_total[1m]) + ) + / on (instance) group_left () + sum by (instance) (irate(ceems_cpu_seconds_total{mode!~"idle|iowait|steal"}[1m])) ) - ) - + - ( - instance:ceems_cray_pm_counters_power_watts:pue{domain="node",job=""} # Misc Power Usage by Compute Unit - - on (instance) - sum by (instance) (instance:ceems_cray_pm_counters_power_watts:pue{domain!~"node",job=""}) + > + 0 + or on (instance) + (ceems_compute_unit_cpu_user_seconds_total * 0) ) + + + instance:ceems_cray_pm_counters_power_watts:pue{domain="memory"} * on (instance) group_right () ( - 
ceems_compute_unit_memory_used_bytes{job=""} - / ( - ceems_compute_unit_memory_used_bytes{job=""} - * on (instance) group_left () - ceems_compute_units{job=""} + ceems_compute_unit_memory_used_bytes + / on (instance) group_left () + (ceems_meminfo_MemTotal_bytes - on (instance) ceems_meminfo_MemAvailable_bytes) ) - > - 0 + > + 0 + or on (instance) + (ceems_compute_unit_memory_used_bytes * 0) ) + + + ( + instance:ceems_cray_pm_counters_power_watts:pue{domain="node"} + - on (instance) + sum by (instance) (instance:ceems_cray_pm_counters_power_watts:pue{domain!~"node"}) + ) + * on (instance) group_right () + ( + ceems_compute_unit_memory_used_bytes + / + (ceems_compute_unit_memory_used_bytes * on (instance) group_left () ceems_compute_units) + > + 0 + ) # Total equivalent emissions rate (g/s) due to the host power consumed by the compute unit. - # The equivalent emissions are estimated using emission factor from owid for country - # FR + # The equivalent emissions are estimated using emission factor from owid for country FR - record: uuid:ceems_host_emissions_g_s:pue expr: |2 - label_replace( - uuid:ceems_host_power_watts:pue{job=""} / 3.6e+06, - "provider", - "owid", - "instance", - "(.*)" - ) - * on (provider) group_left () - ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"} - - - # The following recording rules estimate the average CPU, CPU memory usages and - # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts - # per Prometheus job. - # - - name: host-agg-rules- - interval: - rules: - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. 
- - record: job:ceems_cpu_usage:avg_ratio_irate - expr: |2 - avg by (job) ( - ( - sum by (job, instance) ( - irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[]) - ) - * - 100 - / on (instance) group_left () - ((ceems_cpu_count{job=""} > 0) / ceems_cpu_per_core_count{job=""}) - ) - ) - - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU memory used by host relative to - # the available memory to the host. - - record: job:ceems_cpu_memory_usage:avg_ratio - expr: |2 - avg by (job) ( - ( - ( - 1 - - - (ceems_meminfo_MemAvailable_bytes{job=""} / ceems_meminfo_MemTotal_bytes{job=""}) - ) + ( + label_replace(uuid:ceems_host_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "cray-pmc", + "instance", + "(.*)" ) - ) - * - 100 + ) # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue + labels: + ceemspowersources: cray-pmc expr: |2 sum by (job) ( - 1 - * ( - ceems_cray_pm_counters_power_watts{domain="node",job=""} + instance:ceems_cray_pm_counters_power_watts:pue{domain="node"} - on (instance) group_left () - sum by (instance) (ceems_cray_pm_counters_power_watts{domain=~"accel.*",job=""}) + sum by (instance) (instance:ceems_cray_pm_counters_power_watts:pue{domain=~"accel.*"}) ) ) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. 
- # The equivalent emissions are estimated for country FR - record: job:ceems_host_emissions_g_s:pue expr: |2 - sum by (job, country_code, country, provider) ( - ( - job:ceems_host_power_watts:pue{job=""} / 3.6e+06 - * on (job) group_right () - label_replace(ceems_emissions_gCo2_kWh, "job", "", "instance", "(.*)") - ) + ( + label_replace(job:ceems_host_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "cray-pmc", + "instance", + "(.*)" + ) ) diff --git a/etc/prometheus/rules/host-power-hwmon.rules b/etc/prometheus/rules/host-power-hwmon.rules new file mode 100644 index 00000000..5646bcb7 --- /dev/null +++ b/etc/prometheus/rules/host-power-hwmon.rules @@ -0,0 +1,185 @@ +--- +# Recording rules for compute unit host power usage reported by HWMon. +# +# The following rules estimate the power usage of compute units and their equivalent +# emissions when HWMon power usage sensors are available on the nodes. +# +# HWMon organize the hardware into different chips and reports the power usage of +# different chip components. Depending on the server type, we should only include +# the chip that report the host power usage excluding any GPUs. +# +# DEFAULT RULES TAKE POWER SUMMED OVER ALL CHIPS AND SENSORS. RULES NEEDS TO BE APPROPRIATELY +# MODIFIED BASED ON AVAILABLE CHIPS. +# +# Optional placeholders to replace: +# +# : PUE value +# : Evaluation interval +# +# By default rate interval of 1m is used. For scrape intervals more than 30s, use a bigger +# rate interval. +# +# By default emissions are estimated using OWID for France. In order to change +# them replace `owid` with appropriate emissions provider supported by CEEMS exporter +# and replace `country_code` with appropriate country code. 
More details in CEEMS +# exporter docs (https://ceems-dev.github.io/ceems/docs/components/ceems-exporter#emissions-collector). +# +groups: + - name: hwmon-power-usage-rules + # interval: + rules: + # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. + # To use selective chip components, replace .* in the rule by chassis regex, eg, "Chip_1|Chip_2" + - record: instance:ceems_hwmon_power_current_watts:pue + labels: + ceemspowersources: hwmon-rapl + expr: sum without (sensor) (ceems_hwmon_power_current_watts{chip=~".*"}) # * + + # Total host power (Watts) consumed by the compute unit accounting PUE value. + # + # Firstly, we make an assumption that 90% of power is consumed by CPU, DRAM and 10% by other + # peripherals like network, storage, etc. + # + # (If the assumption does not fit your infrastructure, you can manually change the values + # in the rules. For instance, if the server has many storage disks, the 10 % can be increased + # further to account for disk power consumption.) + # + # We leverage RAPL package and DRAM counters to split the rest of 90% power between CPU and DRAM + # components, when available. When RAPL counters are not available, we assume all 90% power + # is consumed by CPU. + # + # At node level, power consumed by CPU and DRAM can be estimated as + # + # Total CPU Power = 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) + # Total CPU DRAM Power = 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) + # + # Now we have power usage at node level for CPU and DRAM. We split it further at the + # compute unit level using CPU time and DRAM usage by the compute unit. For rest of + # of the power usage like network, storage, we split it equally among all compute units + # that running on the node at a given time. 
+ # + # Compute Unit CPU Power = Total CPU Power * (Compute CPU Time / Total CPU Time) + # Compute Unit CPU Memory Power = Total CPU DRAM Power * (Compute Unit Memory / Total Memory) + # Misc Power Usage by Compute Unit = 0.1 * Total Power / Number of Compute Units + # + # Total Compute Unit Host Power = Compute Unit CPU Power + Compute Unit CPU Memory Power + Misc Power Usage by Compute Unit + # + - record: uuid:ceems_host_power_watts:pue + labels: + ceemspowersources: hwmon-rapl + expr: |2 + 0.9 * instance:ceems_hwmon_power_current_watts:pue # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power + ( + ( + sum by (instance) (irate(ceems_rapl_package_joules_total[1m])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total[1m])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total[1m])) + ) + ) + > + 0 + or + sum by (instance) ( + instance:ceems_hwmon_power_current_watts:pue / instance:ceems_hwmon_power_current_watts:pue + ) + ) + * on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power + ( + ( + ( + irate(ceems_compute_unit_cpu_user_seconds_total[1m]) + + + irate(ceems_compute_unit_cpu_system_seconds_total[1m]) + ) + / on (instance) group_left () + sum by (instance) (irate(ceems_cpu_seconds_total{mode!~"idle|iowait|steal"}[1m])) + ) + > + 0 + or on (instance) + ceems_compute_unit_cpu_user_seconds_total * 0 + ) + + + 0.9 * instance:ceems_hwmon_power_current_watts:pue + * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power + ( + ( + sum by (instance) (irate(ceems_rapl_dram_joules_total[1m])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total[1m])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total[1m])) + ) + ) + > + 0 + or + sum by (instance) 
(instance:ceems_hwmon_power_current_watts:pue * 0) + ) + * on (instance) group_right () # Total CPU Memory Power * (Compute Unit Memory / Total Memory) -> Compute Unit CPU Memory Power + ( + ( + ceems_compute_unit_memory_used_bytes + / on (instance) group_left () + (ceems_meminfo_MemTotal_bytes - on (instance) ceems_meminfo_MemAvailable_bytes) + ) + > + 0 + or on (instance) + ceems_compute_unit_memory_used_bytes * 0 + ) + + + 0.1 * instance:ceems_hwmon_power_current_watts:pue # Total Misc Power Usage + * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit + ( + ceems_compute_unit_memory_used_bytes + / + ( + ceems_compute_unit_memory_used_bytes + * on (instance) group_left () + ceems_compute_units + ) > 0 + ) + + # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit. + # The equivalent emissions are estimated using emission factor from owid for country FR + - record: uuid:ceems_host_emissions_g_s:pue + expr: |2 + ( + label_replace(uuid:ceems_host_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "hwmon-rapl", + "instance", + "(.*)" + ) + ) + + # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. + - record: job:ceems_host_power_watts:pue + labels: + ceemspowersources: hwmon-rapl + expr: sum by (job) (instance:ceems_hwmon_power_current_watts:pue) + + # Total equivalent emissions rate (g/s) due to the power consumed by all the hosts excluding GPUs + # in a Prometheus job accounting PUE value. 
+ - record: job:ceems_host_emissions_g_s:pue + expr: |2 + ( + label_replace(job:ceems_host_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "hwmon-rapl", + "instance", + "(.*)" + ) + ) diff --git a/etc/prometheus/rules/host-power-ipmi-with-nvidia-gpu.rules b/etc/prometheus/rules/host-power-ipmi-with-nvidia-gpu.rules deleted file mode 100644 index d4ca44a4..00000000 --- a/etc/prometheus/rules/host-power-ipmi-with-nvidia-gpu.rules +++ /dev/null @@ -1,170 +0,0 @@ ---- -# Recording rules for scrape job -# -# The following recording rules provide several CPU related metrics of the individual -# compute units. Each of these metrics involving multiple raw metrics to compute them. -# Performing such queries involving multiple metrics is a computational intensive -# operation for Prometheus and hence, we leverage recording rules to estimate them -# in the real time and store them in the TSDB. The downside of this approach is that -# it creates new metrics which consume more space. However, we add atmost 10 new metrics -# which should not increase the disk use of TSDB enormously. -# -# Placeholders to replace: -# : Prometheus job name -# : Prometheus job name under which DCGM exporter is running on the same host -# : Evaluation interval -# : Rate interval -# -groups: - - name: compute-unit-rules- - interval: - rules: - # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. - - record: uuid:ceems_cpu_usage:ratio_irate - expr: |2 - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) * 100 - / - (ceems_compute_unit_cpus{job=""} > 0) - - # CPU memory usage (%) of compute unit. It is percentage of CPU memory used by compute unit relative to - # the available memory to the compute unit. 
- - record: uuid:ceems_cpu_memory_usage:ratio - expr: |2 - ceems_compute_unit_memory_used_bytes{job=""} * 100 - / - (ceems_compute_unit_memory_total_bytes{job=""} > 0) - - # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - - record: instance:ceems_ipmi_dcmi_current_watts:pue - expr: 1 * (label_replace(ceems_ipmi_dcmi_current_watts{job=""}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE{job=""} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 - - # Total host power (Watts) consumed by the compute unit accounting PUE value. - # - # Firstly, we make an assumption that 90% of power is consumed by CPU, DRAM and 10% by other - # peripherals like network, storage, etc. - # - # (If the assumption does not fit your infrastructure, you can manually change the values - # in the rules. For instance, if the server has many storage disks, the 10 % can be increased - # further to account for disk power consumption.) - # - # We leverage RAPL package and DRAM counters to split the rest of 90% power between CPU and DRAM - # components, when available. When RAPL counters are not available, we assume all 90% power - # is consumed by CPU. - # - # At node level, power consumed by CPU and DRAM can be estimated as - # - # Total CPU Power = 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) - # Total CPU DRAM Power = 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) - # - # Now we have power usage at node level for CPU and DRAM. We split it further at the - # compute unit level using CPU time and DRAM usage by the compute unit. For rest of - # of the power usage like network, storage, we split it equally among all compute units - # that running on the node at a given time. 
- # - # Compute Unit CPU Power = Total CPU Power * (Compute CPU Time / Total CPU Time) - # Compute Unit CPU Memory Power = Total CPU DRAM Power * (Compute Unit Memory / Total Memory) - # Misc Power Usage by Compute Unit = 0.1 * Total Power / Number of Compute Units - # - # Total Compute Unit Host Power = Compute Unit CPU Power + Compute Unit CPU Memory Power + Misc Power Usage by Compute Unit - # - - record: uuid:ceems_host_power_watts:pue - expr: |2 - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job=""} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. - * on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power - ( - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) - / on (instance) group_left () - sum by (instance) (irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[])) - ) - + - 0.1 * instance:ceems_ipmi_dcmi_current_watts:pue{job=""} # Total Misc Power Usage - * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit - ( - ceems_compute_unit_memory_used_bytes{job=""} - / - ( - ceems_compute_unit_memory_used_bytes{job=""} - * on (instance) group_left () - ceems_compute_units{job=""} - ) > 0 - ) - - # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit. 
- # The equivalent emissions are estimated using emission factor from owid for country - # FR - - record: uuid:ceems_host_emissions_g_s:pue - expr: |2 - label_replace( - uuid:ceems_host_power_watts:pue{job=""} / 3.6e+06, - "provider", - "owid", - "instance", - "(.*)" - ) - * on (provider) group_left () - ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"} - - - # The following recording rules estimate the average CPU, CPU memory usages and - # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts - # per Prometheus job. - # - - name: host-agg-rules- - interval: - rules: - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. - - record: job:ceems_cpu_usage:avg_ratio_irate - expr: |2 - avg by (job) ( - ( - sum by (job, instance) ( - irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[]) - ) - * - 100 - / on (instance) group_left () - ((ceems_cpu_count{job=""} > 0) / ceems_cpu_per_core_count{job=""}) - ) - ) - - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU memory used by host relative to - # the available memory to the host. - - record: job:ceems_cpu_memory_usage:avg_ratio - expr: |2 - avg by (job) ( - ( - ( - 1 - - - (ceems_meminfo_MemAvailable_bytes{job=""} / ceems_meminfo_MemTotal_bytes{job=""}) - ) - ) - ) - * - 100 - - # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - - record: job:ceems_host_power_watts:pue - expr: |2 - sum by (job) (1 * (label_replace(ceems_ipmi_dcmi_current_watts{job=""}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(DCGM_FI_DEV_POWER_USAGE{job="ipmi-nvidia-gpu"} / 1, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) - - # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs - # in a Prometheus job accounting PUE value. 
- # The equivalent emissions are estimated for country FR - - record: job:ceems_host_emissions_g_s:pue - expr: |2 - sum by (job, country_code, country, provider) ( - ( - job:ceems_host_power_watts:pue{job=""} / 3.6e+06 - * on (job) group_right () - label_replace(ceems_emissions_gCo2_kWh, "job", "", "instance", "(.*)") - ) - ) diff --git a/etc/prometheus/rules/host-power-ipmi.rules b/etc/prometheus/rules/host-power-ipmi.rules index 487f1385..749ea815 100644 --- a/etc/prometheus/rules/host-power-ipmi.rules +++ b/etc/prometheus/rules/host-power-ipmi.rules @@ -1,46 +1,57 @@ --- -# Recording rules for scrape job +# Recording rules for compute unit host power usage reported by IPMI DCMI. # -# The following recording rules provide several CPU related metrics of the individual -# compute units. Each of these metrics involving multiple raw metrics to compute them. -# Performing such queries involving multiple metrics is a computational intensive -# operation for Prometheus and hence, we leverage recording rules to estimate them -# in the real time and store them in the TSDB. The downside of this approach is that -# it creates new metrics which consume more space. However, we add atmost 10 new metrics -# which should not increase the disk use of TSDB enormously. +# The following rules estimate the power usage of compute units and their equivalent +# emissions when IPMI DCMI power usage is available on the nodes. # -# Placeholders to replace: -# : Prometheus job name -# : Prometheus job name under which DCGM exporter is running on the same host +# On some servers (with GPUs) and/or BMCs, the power reported by IPMI DCMI might contain +# power usage by GPUs as well. In that case we need to deduce the GPU power usage from +# power usage reported by IPMI DCMI to estimate the host power usage. Depending on the +# GPU type and exporter used to export GPU metrics, different rules must be used. 
They +# are included in the file as comments and depending on your server, use the +# appropriate rule for `instance:ceems_ipmi_dcmi_power_current_watts:pue` +# +# Optional placeholders to replace: +# +# : PUE value # : Evaluation interval -# : Rate interval +# +# By default rate interval of 1m is used. For scrape intervals more than 30s, use a bigger +# rate interval. +# +# By default emissions are estimated using OWID for France. In order to change +# them replace `owid` with appropriate emissions provider supported by CEEMS exporter +# and replace `country_code` with appropriate country code. More details in CEEMS +# exporter docs (https://ceems-dev.github.io/ceems/docs/components/ceems-exporter#emissions-collector). # groups: - - name: compute-unit-rules- - interval: + - name: ipmi-power-usage-rules + # interval: rules: - # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. - - record: uuid:ceems_cpu_usage:ratio_irate - expr: |2 - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) * 100 - / - (ceems_compute_unit_cpus{job=""} > 0) - - # CPU memory usage (%) of compute unit. It is percentage of CPU memory used by compute unit relative to - # the available memory to the compute unit. - - record: uuid:ceems_cpu_memory_usage:ratio - expr: |2 - ceems_compute_unit_memory_used_bytes{job=""} * 100 - / - (ceems_compute_unit_memory_total_bytes{job=""} > 0) - # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. 
- - record: instance:ceems_ipmi_dcmi_current_watts:pue - expr: 1 * ceems_ipmi_dcmi_current_watts{job=""} + - record: instance:ceems_ipmi_dcmi_power_current_watts:pue + labels: + ceemspowersources: ipmi-rapl + expr: ceems_ipmi_dcmi_power_current_watts # * + + # # On some servers and BMCs, the power usage reported by IPMI DCMI might contain the power consumption + # # of the GPUs too (if they are present). If that is the case, we need to remove the GPU power usage + # # from the power usage reported by IPMI DCMI to get host only power usage. + # # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. + # # + # # The following rule must be used when NVIDIA GPUs power usage is included in IPMI DCMI power usage. + # - record: instance:ceems_ipmi_dcmi_power_current_watts:pue + # expr: (ceems_ipmi_dcmi_power_current_watts - on (hostname) group_left () sum by (hostname) (avg by (hostname,device) (label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT, "hostname", "$1", "Hostname","(.*)")))) # * + # # + # # The following rule must be used when AMD GPUs power usage is included in IPMI DCMI power usage. + # # and AMD device metric exporter is used and metric labels are prefixed by `amd_`. + # - record: instance:ceems_ipmi_dcmi_power_current_watts:pue + # expr: (ceems_ipmi_dcmi_power_current_watts - on (hostname) group_left () sum by (hostname) (sum by (hostname,serial_number) (amd_gpu_package_power))) # * + # # + # # The following rule must be used when AMD GPUs power usage is included in IPMI DCMI power usage. + # # and AMD SMI exporter is used. + # - record: instance:ceems_ipmi_dcmi_power_current_watts:pue + # expr: (ceems_ipmi_dcmi_power_current_watts - on (hostname) group_left () sum by (hostname) (label_replace(amd_gpu_power, "hostname", "$1", "instance", "([^:]+):\\d+") / 1e6)) # * # Total host power (Watts) consumed by the compute unit accounting PUE value. 
# @@ -72,131 +83,121 @@ groups: # Total Compute Unit Host Power = Compute Unit CPU Power + Compute Unit CPU Memory Power + Misc Power Usage by Compute Unit # - record: uuid:ceems_host_power_watts:pue + labels: + ceemspowersources: ipmi-rapl expr: |2 - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job=""} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_ipmi_dcmi_power_current_watts:pue # Assumption 90% Power usage by CPU, CPU memory and other peripherals. * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job=""}[])) - / ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job=""}[])) - + - sum by (instance) (irate(ceems_rapl_dram_joules_total{job=""}[])) + sum by (instance) (irate(ceems_rapl_package_joules_total[1m])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total[1m])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total[1m])) + ) ) - ) + > + 0 + or + sum by (instance) ( + instance:ceems_ipmi_dcmi_power_current_watts:pue / instance:ceems_ipmi_dcmi_power_current_watts:pue + ) + ) * on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power ( - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) - / on (instance) group_left () - sum by (instance) (irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[])) + ( + ( + irate(ceems_compute_unit_cpu_user_seconds_total[1m]) + + + irate(ceems_compute_unit_cpu_system_seconds_total[1m]) + ) + / on (instance) group_left () + sum by (instance) (irate(ceems_cpu_seconds_total{mode!~"idle|iowait|steal"}[1m])) + ) + > + 0 + or on (instance) + ceems_compute_unit_cpu_user_seconds_total * 0 ) + - 0.9 * instance:ceems_ipmi_dcmi_current_watts:pue{job=""} + 0.9 * 
instance:ceems_ipmi_dcmi_power_current_watts:pue * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power ( - sum by (instance) (irate(ceems_rapl_dram_joules_total{job=""}[])) - / ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job=""}[])) - + - sum by (instance) (irate(ceems_rapl_dram_joules_total{job=""}[])) + sum by (instance) (irate(ceems_rapl_dram_joules_total[1m])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total[1m])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total[1m])) + ) ) - ) + > + 0 + or + sum by (instance) (instance:ceems_ipmi_dcmi_power_current_watts:pue * 0) + ) * on (instance) group_right () # Total CPU Memory Power * (Compute Unit Memory / Total Memory) -> Compute Unit CPU Memory Power ( - ceems_compute_unit_memory_used_bytes{job=""} - / on (instance) group_left () - ( - ceems_meminfo_MemTotal_bytes{job=""} - - on (instance) - ceems_meminfo_MemAvailable_bytes{job=""} - ) + ( + ceems_compute_unit_memory_used_bytes + / on (instance) group_left () + (ceems_meminfo_MemTotal_bytes - on (instance) ceems_meminfo_MemAvailable_bytes) + ) + > + 0 + or on (instance) + ceems_compute_unit_memory_used_bytes * 0 ) + - 0.1 * instance:ceems_ipmi_dcmi_current_watts:pue{job=""} # Total Misc Power Usage + 0.1 * instance:ceems_ipmi_dcmi_power_current_watts:pue # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( - ceems_compute_unit_memory_used_bytes{job=""} + ceems_compute_unit_memory_used_bytes / ( - ceems_compute_unit_memory_used_bytes{job=""} + ceems_compute_unit_memory_used_bytes * on (instance) group_left () - ceems_compute_units{job=""} + ceems_compute_units ) > 0 ) # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit. 
- # The equivalent emissions are estimated using emission factor from owid for country - # FR + # The equivalent emissions are estimated using emission factor from owid for country FR - record: uuid:ceems_host_emissions_g_s:pue expr: |2 - label_replace( - uuid:ceems_host_power_watts:pue{job=""} / 3.6e+06, - "provider", - "owid", - "instance", - "(.*)" - ) - * on (provider) group_left () - ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"} - - - # The following recording rules estimate the average CPU, CPU memory usages and - # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts - # per Prometheus job. - # - - name: host-agg-rules- - interval: - rules: - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. - - record: job:ceems_cpu_usage:avg_ratio_irate - expr: |2 - avg by (job) ( - ( - sum by (job, instance) ( - irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[]) - ) - * - 100 - / on (instance) group_left () - ((ceems_cpu_count{job=""} > 0) / ceems_cpu_per_core_count{job=""}) - ) - ) - - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU memory used by host relative to - # the available memory to the host. - - record: job:ceems_cpu_memory_usage:avg_ratio - expr: |2 - avg by (job) ( - ( - ( - 1 - - - (ceems_meminfo_MemAvailable_bytes{job=""} / ceems_meminfo_MemTotal_bytes{job=""}) - ) + ( + label_replace(uuid:ceems_host_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "ipmi-rapl", + "instance", + "(.*)" ) - ) - * - 100 + ) # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue - expr: |2 - sum by (job) (1 * ceems_ipmi_dcmi_current_watts{job=""}) + labels: + ceemspowersources: ipmi-rapl + expr: sum by (job) (instance:ceems_ipmi_dcmi_power_current_watts:pue) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. - # The equivalent emissions are estimated for country FR - record: job:ceems_host_emissions_g_s:pue expr: |2 - sum by (job, country_code, country, provider) ( - ( - job:ceems_host_power_watts:pue{job=""} / 3.6e+06 - * on (job) group_right () - label_replace(ceems_emissions_gCo2_kWh, "job", "", "instance", "(.*)") - ) + ( + label_replace(job:ceems_host_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "ipmi-rapl", + "instance", + "(.*)" + ) ) diff --git a/etc/prometheus/rules/host-power-rapl.rules b/etc/prometheus/rules/host-power-rapl.rules index e9b922e6..86bcd769 100644 --- a/etc/prometheus/rules/host-power-rapl.rules +++ b/etc/prometheus/rules/host-power-rapl.rules @@ -1,52 +1,39 @@ --- -# Recording rules for scrape job +# Recording rules for compute unit host power usage reported by RAPL. # -# The following recording rules provide several CPU related metrics of the individual -# compute units. Each of these metrics involving multiple raw metrics to compute them. -# Performing such queries involving multiple metrics is a computational intensive -# operation for Prometheus and hence, we leverage recording rules to estimate them -# in the real time and store them in the TSDB. The downside of this approach is that -# it creates new metrics which consume more space. However, we add atmost 10 new metrics -# which should not increase the disk use of TSDB enormously. 
+# The following rules estimate the power usage of compute units and their equivalent +# emissions when RAPL energy counters are available on the nodes. # -# Placeholders to replace: -# : Prometheus job name -# : Prometheus job name under which DCGM exporter is running on the same host +# Optional placeholders to replace: +# +# : PUE value # : Evaluation interval -# : Rate interval +# +# By default rate interval of 1m is used. For scrape intervals more than 30s, use a bigger +# rate interval. +# +# By default emissions are estimated using OWID for France. In order to change +# them replace `owid` with appropriate emissions provider supported by CEEMS exporter +# and replace `country_code` with appropriate country code. More details in CEEMS +# exporter docs (https://ceems-dev.github.io/ceems/docs/components/ceems-exporter#emissions-collector). # groups: - - name: compute-unit-rules- - interval: + - name: rapl-power-usage-rules + # interval: rules: - # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. - - record: uuid:ceems_cpu_usage:ratio_irate - expr: |2 - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) * 100 - / - (ceems_compute_unit_cpus{job=""} > 0) - - # CPU memory usage (%) of compute unit. It is percentage of CPU memory used by compute unit relative to - # the available memory to the compute unit. - - record: uuid:ceems_cpu_memory_usage:ratio - expr: |2 - ceems_compute_unit_memory_used_bytes{job=""} * 100 - / - (ceems_compute_unit_memory_total_bytes{job=""} > 0) - # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. # By default, a PUE of 1 is used. 
- record: instance:ceems_rapl_package_joules_total:pue - expr: 1 * ceems_rapl_package_joules_total{job=""} + labels: + ceemspowersources: rapl + expr: ceems_rapl_package_joules_total # * # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. # By default, a PUE of 1 is used. - record: instance:ceems_rapl_dram_joules_total:pue - expr: 1 * ceems_rapl_dram_joules_total{job=""} + labels: + ceemspowersources: rapl + expr: ceems_rapl_dram_joules_total # * # Total power (Watts) consumed by the compute unit accounting PUE value. # @@ -59,109 +46,84 @@ groups: # Compute Unit Host Power = Compute Unit CPU Power + Compute Unit Memory Power # - record: uuid:ceems_host_power_watts:pue + labels: + ceemspowersources: rapl expr: |2 ( - sum by (instance) (irate(instance:ceems_rapl_package_joules_total:pue{job=""}[])) - * on (instance) group_right () - ( - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) - / on (instance) group_left () - sum by (instance) (irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[])) - ) - + - sum by (instance) (irate(instance:ceems_rapl_dram_joules_total:pue{job=""}[])) - * on (instance) group_right () - ( - ceems_compute_unit_memory_used_bytes{job=""} - / on (instance) group_left () - ( - ceems_meminfo_MemTotal_bytes{job=""} - - on (instance) - ceems_meminfo_MemAvailable_bytes{job=""} - ) - ) + sum by (instance) (irate(instance:ceems_rapl_package_joules_total:pue[1m])) + * on (instance) group_right () + ( + ( + ( + irate(ceems_compute_unit_cpu_user_seconds_total[1m]) + + + irate(ceems_compute_unit_cpu_system_seconds_total[1m]) + ) + / on (instance) group_left () + sum by (instance) (irate(ceems_cpu_seconds_total{mode!~"idle|iowait|steal"}[1m])) + ) + > + 0 + or on (instance) + (ceems_compute_unit_cpu_user_seconds_total * 0) + ) + + + sum by (instance) (irate(instance:ceems_rapl_dram_joules_total:pue[1m])) + * on 
(instance) group_right () + ( + ( + ceems_compute_unit_memory_used_bytes + / on (instance) group_left () + (ceems_meminfo_MemTotal_bytes - on (instance) ceems_meminfo_MemAvailable_bytes) + ) + > + 0 + or on (instance) + (ceems_compute_unit_memory_used_bytes * 0) + ) ) # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit. - # The equivalent emissions are estimated using emission factor from owid for country - # FR + # The equivalent emissions are estimated using emission factor from owid for country FR - record: uuid:ceems_host_emissions_g_s:pue expr: |2 - label_replace( - uuid:ceems_host_power_watts:pue{job=""} / 3.6e+06, - "provider", - "owid", - "instance", - "(.*)" - ) - * on (provider) group_left () - ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"} - - - # The following recording rules estimate the average CPU, CPU memory usages and - # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts - # per Prometheus job. - # - - name: host-agg-rules- - interval: - rules: - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. - - record: job:ceems_cpu_usage:avg_ratio_irate - expr: |2 - avg by (job) ( - ( - sum by (job, instance) ( - irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[]) - ) - * - 100 - / on (instance) group_left () - ((ceems_cpu_count{job=""} > 0) / ceems_cpu_per_core_count{job=""}) - ) - ) - - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU memory used by host relative to - # the available memory to the host. 
- - record: job:ceems_cpu_memory_usage:avg_ratio - expr: |2 - avg by (job) ( - ( - ( - 1 - - - (ceems_meminfo_MemAvailable_bytes{job=""} / ceems_meminfo_MemTotal_bytes{job=""}) - ) + ( + label_replace(uuid:ceems_host_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "rapl", + "instance", + "(.*)" ) - ) - * - 100 + ) # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - record: job:ceems_host_power_watts:pue + labels: + ceemspowersources: rapl expr: |2 sum by (job) ( - 1 - * ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job=""}[])) + sum by (instance) (irate(instance:ceems_rapl_package_joules_total:pue[1m])) + - sum by (instance) (irate(ceems_rapl_dram_joules_total{job=""}[])) + sum by (instance) (irate(instance:ceems_rapl_dram_joules_total:pue[1m])) ) ) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. 
- # The equivalent emissions are estimated for country FR - record: job:ceems_host_emissions_g_s:pue expr: |2 - sum by (job, country_code, country, provider) ( - ( - job:ceems_host_power_watts:pue{job=""} / 3.6e+06 - * on (job) group_right () - label_replace(ceems_emissions_gCo2_kWh, "job", "", "instance", "(.*)") - ) + ( + label_replace(job:ceems_host_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "rapl", + "instance", + "(.*)" + ) ) diff --git a/etc/prometheus/rules/host-power-redfish-with-amd-gpu.rules b/etc/prometheus/rules/host-power-redfish-with-amd-gpu.rules deleted file mode 100644 index cac1b17a..00000000 --- a/etc/prometheus/rules/host-power-redfish-with-amd-gpu.rules +++ /dev/null @@ -1,203 +0,0 @@ ---- -# Recording rules for scrape job -# -# The following recording rules provide several CPU related metrics of the individual -# compute units. Each of these metrics involving multiple raw metrics to compute them. -# Performing such queries involving multiple metrics is a computational intensive -# operation for Prometheus and hence, we leverage recording rules to estimate them -# in the real time and store them in the TSDB. The downside of this approach is that -# it creates new metrics which consume more space. However, we add atmost 10 new metrics -# which should not increase the disk use of TSDB enormously. -# -# Placeholders to replace: -# : Prometheus job name -# : Prometheus job name under which DCGM exporter is running on the same host -# : Chassis name of Redfish which reports the host power usage including GPUs -# : Evaluation interval -# : Rate interval -# -groups: - - name: compute-unit-rules- - interval: - rules: - # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. 
- - record: uuid:ceems_cpu_usage:ratio_irate - expr: |2 - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) * 100 - / - (ceems_compute_unit_cpus{job=""} > 0) - - # CPU memory usage (%) of compute unit. It is percentage of CPU memory used by compute unit relative to - # the available memory to the compute unit. - - record: uuid:ceems_cpu_memory_usage:ratio - expr: |2 - ceems_compute_unit_memory_used_bytes{job=""} * 100 - / - (ceems_compute_unit_memory_total_bytes{job=""} > 0) - - # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - - record: instance:ceems_redfish_current_watts:pue - expr: 1 * (label_replace(ceems_redfish_current_watts{job="",chassis=""}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(amg_gpu_power_watts{job=""} / 1e6, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0 - - # Total host power (Watts) consumed by the compute unit accounting PUE value. - # - # Firstly, we make an assumption that 90% of power is consumed by CPU, DRAM and 10% by other - # peripherals like network, storage, etc. - # - # (If the assumption does not fit your infrastructure, you can manually change the values - # in the rules. For instance, if the server has many storage disks, the 10 % can be increased - # further to account for disk power consumption.) - # - # We leverage RAPL package and DRAM counters to split the rest of 90% power between CPU and DRAM - # components, when available. When RAPL counters are not available, we assume all 90% power - # is consumed by CPU. 
- # - # At node level, power consumed by CPU and DRAM can be estimated as - # - # Total CPU Power = 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) - # Total CPU DRAM Power = 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) - # - # Now we have power usage at node level for CPU and DRAM. We split it further at the - # compute unit level using CPU time and DRAM usage by the compute unit. For rest of - # of the power usage like network, storage, we split it equally among all compute units - # that running on the node at a given time. - # - # Compute Unit CPU Power = Total CPU Power * (Compute CPU Time / Total CPU Time) - # Compute Unit CPU Memory Power = Total CPU DRAM Power * (Compute Unit Memory / Total Memory) - # Misc Power Usage by Compute Unit = 0.1 * Total Power / Number of Compute Units - # - # Total Compute Unit Host Power = Compute Unit CPU Power + Compute Unit CPU Memory Power + Misc Power Usage by Compute Unit - # - - record: uuid:ceems_host_power_watts:pue - expr: |2 - 0.9 * instance:ceems_redfish_current_watts:pue{job=""} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. 
- * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power - ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job=""}[])) - / - ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job=""}[])) - + - sum by (instance) (irate(ceems_rapl_dram_joules_total{job=""}[])) - ) - ) - * on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power - ( - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) - / on (instance) group_left () - sum by (instance) (irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[])) - ) - + - 0.9 * instance:ceems_redfish_current_watts:pue{job=""} - * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power - ( - sum by (instance) (irate(ceems_rapl_dram_joules_total{job=""}[])) - / - ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job=""}[])) - + - sum by (instance) (irate(ceems_rapl_dram_joules_total{job=""}[])) - ) - ) - * on (instance) group_right () # Total CPU Memory Power * (Compute Unit Memory / Total Memory) -> Compute Unit CPU Memory Power - ( - ceems_compute_unit_memory_used_bytes{job=""} - / on (instance) group_left () - ( - ceems_meminfo_MemTotal_bytes{job=""} - - on (instance) - ceems_meminfo_MemAvailable_bytes{job=""} - ) - ) - + - 0.1 * instance:ceems_redfish_current_watts:pue{job=""} # Total Misc Power Usage - * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit - ( - ceems_compute_unit_memory_used_bytes{job=""} - / - ( - ceems_compute_unit_memory_used_bytes{job=""} - * on (instance) group_left () - ceems_compute_units{job=""} - ) > 0 - ) - - # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit. 
- # The equivalent emissions are estimated using emission factor from owid for country - # FR - - record: uuid:ceems_host_emissions_g_s:pue - expr: |2 - label_replace( - uuid:ceems_host_power_watts:pue{job=""} / 3.6e+06, - "provider", - "owid", - "instance", - "(.*)" - ) - * on (provider) group_left () - ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"} - - - # The following recording rules estimate the average CPU, CPU memory usages and - # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts - # per Prometheus job. - # - - name: host-agg-rules- - interval: - rules: - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. - - record: job:ceems_cpu_usage:avg_ratio_irate - expr: |2 - avg by (job) ( - ( - sum by (job, instance) ( - irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[]) - ) - * - 100 - / on (instance) group_left () - ((ceems_cpu_count{job=""} > 0) / ceems_cpu_per_core_count{job=""}) - ) - ) - - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU memory used by host relative to - # the available memory to the host. - - record: job:ceems_cpu_memory_usage:avg_ratio - expr: |2 - avg by (job) ( - ( - ( - 1 - - - (ceems_meminfo_MemAvailable_bytes{job=""} / ceems_meminfo_MemTotal_bytes{job=""}) - ) - ) - ) - * - 100 - - # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. - - record: job:ceems_host_power_watts:pue - expr: |2 - sum by (job) (1 * (label_replace(ceems_redfish_current_watts{job="",chassis=""}, "instancehost", "$1", "instance", "([^:]+):\\d+") - on (instancehost) group_left () sum by (instancehost) (label_replace(amg_gpu_power_watts{job=""} / 1e6, "instancehost", "$1", "instance", "([^:]+):\\d+"))) > 0) - - # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs - # in a Prometheus job accounting PUE value. 
- # The equivalent emissions are estimated for country FR - - record: job:ceems_host_emissions_g_s:pue - expr: |2 - sum by (job, country_code, country, provider) ( - ( - job:ceems_host_power_watts:pue{job=""} / 3.6e+06 - * on (job) group_right () - label_replace(ceems_emissions_gCo2_kWh, "job", "", "instance", "(.*)") - ) - ) diff --git a/etc/prometheus/rules/host-power-redfish.rules b/etc/prometheus/rules/host-power-redfish.rules index d1803956..fe311022 100644 --- a/etc/prometheus/rules/host-power-redfish.rules +++ b/etc/prometheus/rules/host-power-redfish.rules @@ -1,47 +1,39 @@ --- -# Recording rules for scrape job +# Recording rules for compute unit host power usage reported by Redfish. # -# The following recording rules provide several CPU related metrics of the individual -# compute units. Each of these metrics involving multiple raw metrics to compute them. -# Performing such queries involving multiple metrics is a computational intensive -# operation for Prometheus and hence, we leverage recording rules to estimate them -# in the real time and store them in the TSDB. The downside of this approach is that -# it creates new metrics which consume more space. However, we add atmost 10 new metrics -# which should not increase the disk use of TSDB enormously. +# The following rules estimate the power usage of compute units and their equivalent +# emissions when Redfish power usage is available on the nodes. # -# Placeholders to replace: -# : Prometheus job name -# : Prometheus job name under which DCGM exporter is running on the same host -# : Chassis name of Redfish which reports the host power usage alone (excluding GPUs) +# Redfish organize the hardware into different chassis and reports the power usage of +# different chassis components. Depending on the server type, we should only include +# the chassis that report the host power usage excluding any GPUs. +# +# DEFAULT RULES TAKE POWER SUMMED OVER ALL CHASSIS. 
RULES NEEDS TO BE APPROPRIATELY +# MODIFIED BASED ON AVAILABLE CHASSIS. +# +# Optional placeholders to replace: +# +# : PUE value # : Evaluation interval -# : Rate interval +# +# By default rate interval of 1m is used. For scrape intervals more than 30s, use a bigger +# rate interval. +# +# By default emissions are estimated using OWID for France. In order to change +# them replace `owid` with appropriate emissions provider supported by CEEMS exporter +# and replace `country_code` with appropriate country code. More details in CEEMS +# exporter docs (https://ceems-dev.github.io/ceems/docs/components/ceems-exporter#emissions-collector). # groups: - - name: compute-unit-rules- - interval: + - name: redfish-power-usage-rules + # interval: rules: - # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. - - record: uuid:ceems_cpu_usage:ratio_irate - expr: |2 - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) * 100 - / - (ceems_compute_unit_cpus{job=""} > 0) - - # CPU memory usage (%) of compute unit. It is percentage of CPU memory used by compute unit relative to - # the available memory to the compute unit. - - record: uuid:ceems_cpu_memory_usage:ratio - expr: |2 - ceems_compute_unit_memory_used_bytes{job=""} * 100 - / - (ceems_compute_unit_memory_total_bytes{job=""} > 0) - # Total power (Watts) consumed by the instance by accounting Power Usage Effectiveness (PUE) value. - - record: instance:ceems_redfish_current_watts:pue - expr: 1 * ceems_redfish_current_watts{job="",chassis=""} + # To use selective chassis components, replace .* in the rule by chassis regex, eg, "Chassis_1|Chassis_2" + - record: instance:ceems_redfish_power_current_watts:pue + labels: + ceemspowersources: redfish-rapl + expr: sum without (chassis) (ceems_redfish_power_current_watts{chassis=~".*"}) # * # Total host power (Watts) consumed by the compute unit accounting PUE value. 
# @@ -73,131 +65,121 @@ groups: # Total Compute Unit Host Power = Compute Unit CPU Power + Compute Unit CPU Memory Power + Misc Power Usage by Compute Unit # - record: uuid:ceems_host_power_watts:pue + labels: + ceemspowersources: redfish-rapl expr: |2 - 0.9 * instance:ceems_redfish_current_watts:pue{job=""} # Assumption 90% Power usage by CPU, CPU memory and other peripherals. + 0.9 * instance:ceems_redfish_power_current_watts:pue # Assumption 90% Power usage by CPU, CPU memory and other peripherals. * on (instance) group_left () # 0.9 * Total Power * (RAPL Package / (RAPL Package + RAPL DRAM)) -> Total CPU Power ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job=""}[])) - / ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job=""}[])) - + - sum by (instance) (irate(ceems_rapl_dram_joules_total{job=""}[])) + sum by (instance) (irate(ceems_rapl_package_joules_total[1m])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total[1m])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total[1m])) + ) ) - ) + > + 0 + or + sum by (instance) ( + instance:ceems_redfish_power_current_watts:pue / instance:ceems_redfish_power_current_watts:pue + ) + ) * on (instance) group_right () # Total CPU Power * (Compute CPU Time / Total CPU Time) -> Compute Unit CPU Power ( - ( - irate(ceems_compute_unit_cpu_user_seconds_total{job=""}[]) - + - irate(ceems_compute_unit_cpu_system_seconds_total{job=""}[]) - ) - / on (instance) group_left () - sum by (instance) (irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[])) + ( + ( + irate(ceems_compute_unit_cpu_user_seconds_total[1m]) + + + irate(ceems_compute_unit_cpu_system_seconds_total[1m]) + ) + / on (instance) group_left () + sum by (instance) (irate(ceems_cpu_seconds_total{mode!~"idle|iowait|steal"}[1m])) + ) + > + 0 + or on (instance) + ceems_compute_unit_cpu_user_seconds_total * 0 ) + - 0.9 * instance:ceems_redfish_current_watts:pue{job=""} + 0.9 * 
instance:ceems_redfish_power_current_watts:pue * on (instance) group_left () # 0.9 * Total Power * (RAPL DRAM / (RAPL Package + RAPL DRAM)) -> Total CPU Memory Power ( - sum by (instance) (irate(ceems_rapl_dram_joules_total{job=""}[])) - / ( - sum by (instance) (irate(ceems_rapl_package_joules_total{job=""}[])) - + - sum by (instance) (irate(ceems_rapl_dram_joules_total{job=""}[])) + sum by (instance) (irate(ceems_rapl_dram_joules_total[1m])) + / + ( + sum by (instance) (irate(ceems_rapl_package_joules_total[1m])) + + + sum by (instance) (irate(ceems_rapl_dram_joules_total[1m])) + ) ) - ) + > + 0 + or + sum by (instance) (instance:ceems_redfish_power_current_watts:pue * 0) + ) * on (instance) group_right () # Total CPU Memory Power * (Compute Unit Memory / Total Memory) -> Compute Unit CPU Memory Power ( - ceems_compute_unit_memory_used_bytes{job=""} - / on (instance) group_left () - ( - ceems_meminfo_MemTotal_bytes{job=""} - - on (instance) - ceems_meminfo_MemAvailable_bytes{job=""} - ) + ( + ceems_compute_unit_memory_used_bytes + / on (instance) group_left () + (ceems_meminfo_MemTotal_bytes - on (instance) ceems_meminfo_MemAvailable_bytes) + ) + > + 0 + or on (instance) + ceems_compute_unit_memory_used_bytes * 0 ) + - 0.1 * instance:ceems_redfish_current_watts:pue{job=""} # Total Misc Power Usage + 0.1 * instance:ceems_redfish_power_current_watts:pue # Total Misc Power Usage * on (instance) group_right () # Total Misc Power usage / Number of Compute Units -> Misc Power Usage by Compute Unit ( - ceems_compute_unit_memory_used_bytes{job=""} + ceems_compute_unit_memory_used_bytes / ( - ceems_compute_unit_memory_used_bytes{job=""} + ceems_compute_unit_memory_used_bytes * on (instance) group_left () - ceems_compute_units{job=""} + ceems_compute_units ) > 0 ) # Total equivalent emissions rate (g/s) due to the power consumed by the compute unit. 
- # The equivalent emissions are estimated using emission factor from owid for country - # FR + # The equivalent emissions are estimated using emission factor from owid for country FR - record: uuid:ceems_host_emissions_g_s:pue expr: |2 - label_replace( - uuid:ceems_host_power_watts:pue{job=""} / 3.6e+06, - "provider", - "owid", - "instance", - "(.*)" - ) - * on (provider) group_left () - ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"} - - - # The following recording rules estimate the average CPU, CPU memory usages and - # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts - # per Prometheus job. - # - - name: host-agg-rules- - interval: - rules: - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. - - record: job:ceems_cpu_usage:avg_ratio_irate - expr: |2 - avg by (job) ( - ( - sum by (job, instance) ( - irate(ceems_cpu_seconds_total{job="",mode!~"idle|iowait|steal"}[]) - ) - * - 100 - / on (instance) group_left () - ((ceems_cpu_count{job=""} > 0) / ceems_cpu_per_core_count{job=""}) - ) - ) - - # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU memory used by host relative to - # the available memory to the host. - - record: job:ceems_cpu_memory_usage:avg_ratio - expr: |2 - avg by (job) ( - ( - ( - 1 - - - (ceems_meminfo_MemAvailable_bytes{job=""} / ceems_meminfo_MemTotal_bytes{job=""}) - ) + ( + label_replace(uuid:ceems_host_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "redfish-rapl", + "instance", + "(.*)" ) - ) - * - 100 + ) # Total power (Watts) consumed by all hosts excluding GPUs in a Prometheus job accounting PUE value. 
- record: job:ceems_host_power_watts:pue - expr: |2 - sum by (job) (1 * ceems_redfish_current_watts{job="",chassis="Chassis_1"}) + labels: + ceemspowersources: redfish-rapl + expr: sum by (job) (instance:ceems_redfish_power_current_watts:pue) # Total equivalent emissions rate (g/s) due to the power consumed by all ths hosts excluding GPUs # in a Prometheus job accounting PUE value. - # The equivalent emissions are estimated for country FR - record: job:ceems_host_emissions_g_s:pue expr: |2 - sum by (job, country_code, country, provider) ( - ( - job:ceems_host_power_watts:pue{job=""} / 3.6e+06 - * on (job) group_right () - label_replace(ceems_emissions_gCo2_kWh, "job", "", "instance", "(.*)") - ) + ( + label_replace(job:ceems_host_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "redfish-rapl", + "instance", + "(.*)" + ) ) diff --git a/etc/prometheus/rules/host-usage.rules b/etc/prometheus/rules/host-usage.rules new file mode 100644 index 00000000..54b30e33 --- /dev/null +++ b/etc/prometheus/rules/host-usage.rules @@ -0,0 +1,72 @@ +--- +# Recording rules for compute unit host CPU and memory usage +# +# The rules estimate the compute unit's CPU and CPU memory usage +# metrics +# +# Optional placeholders to replace: +# +# : Evaluation interval +# +# By default rate interval of 1m is used. For scrape intervals more than 30s, use a bigger +# rate interval. +# +groups: + - name: host-cpu-mem-usage-rules + # interval: + rules: + # CPU usage (%) of compute unit. It is percentage of CPU cycles spent by the compute unit. + - record: uuid:ceems_cpu_usage:ratio_irate + expr: |2 + ( + irate(ceems_compute_unit_cpu_user_seconds_total[1m]) + + + irate(ceems_compute_unit_cpu_system_seconds_total[1m]) + ) * 100 + / + (ceems_compute_unit_cpus > 0) + + # CPU memory usage (%) of compute unit. 
It is percentage of CPU memory used by compute unit relative to + # the available memory to the compute unit. + - record: uuid:ceems_cpu_memory_usage:ratio + expr: |2 + ceems_compute_unit_memory_used_bytes * 100 + / + (ceems_compute_unit_memory_total_bytes > 0) + + # The following recording rules estimate the average CPU, CPU memory usages and + # total host power (excluding GPUs) and its equivalent emissions aggregared for all hosts + # per Prometheus job. + - name: host-agg-cpu-usage-rules + # interval: + rules: + # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU cycles spent by the host. + - record: job:ceems_cpu_usage:avg_ratio_irate + expr: |2 + avg by (job) ( + ( + sum by (job, instance) ( + irate(ceems_cpu_seconds_total{mode!~"idle|iowait|steal"}[1m]) + ) + * + 100 + / on (instance) group_left () + ((ceems_cpu_count > 0) / ceems_cpu_per_core_count) + ) + ) + + # Average CPU usage (%) of all hosts in a Prometheus job. It is percentage of CPU memory used by host relative to + # the available memory to the host. + - record: job:ceems_cpu_memory_usage:avg_ratio + expr: |2 + avg by (job) ( + ( + ( + 1 + - + (ceems_meminfo_MemAvailable_bytes / ceems_meminfo_MemTotal_bytes) + ) + ) + ) + * + 100 diff --git a/etc/prometheus/rules/nvidia-dcgm-gpu.rules b/etc/prometheus/rules/nvidia-dcgm-gpu.rules new file mode 100644 index 00000000..9bcbb073 --- /dev/null +++ b/etc/prometheus/rules/nvidia-dcgm-gpu.rules @@ -0,0 +1,264 @@ +--- +# Recording rules for NVIDIA GPUs using DCGM exporter. +# Rules assume that all the profiling metrics are enabled. +# +# These rules provide NVIDIA GPU metrics fetched from +# DCGM exporter (https://docs.nvidia.com/datacenter/dcgm/latest/gpu-telemetry/dcgm-exporter.html) +# for each compute unit. +# +# We leverage these rules to include PUE (Power Usage Effectiveness) in the Power +# estimation as well. 
+# +# Optional placeholders to replace: +# +# : PUE value +# : Evaluation interval +# +# By default emissions are estimated using OWID for France. In order to change +# them replace `owid` with appropriate emissions provider supported by CEEMS exporter +# and replace `country_code` with appropriate country code. More details in CEEMS +# exporter docs (https://ceems-dev.github.io/ceems/docs/components/ceems-exporter#emissions-collector). +# +groups: + - name: nvidia-dcgm-gpu-usage-rules + # interval: + rules: + # GPU Usage (%) by compute unit + # Prefer DCGM_FI_PROF_GR_ENGINE_ACTIVE to DCGM_FI_DEV_GPU_UTIL + # when available + # Ref: https://github.com/NVIDIA/DCGM/issues/64#issuecomment-1400811885 + # Ref: https://github.com/NVIDIA/DCGM/issues/80#issuecomment-1537603016 + - record: uuid:ceems_gpu_usage:ratio + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") * 100 + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + # GPU Memory Usage (%) by compute unit + - record: uuid:ceems_gpu_memory_usage:ratio + expr: |2 + ( + label_replace(label_replace(label_replace(DCGM_FI_DEV_FB_USED, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") * 100 + / + ( + label_replace(label_replace(label_replace(DCGM_FI_DEV_FB_USED, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") + + label_replace(label_replace(label_replace(DCGM_FI_DEV_FB_FREE, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") + ) + ) + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + # Average GPU Usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_usage:avg + expr: avg by (job) (DCGM_FI_DEV_GPU_UTIL) + + # 
Average GPU memory usage (%) for all hosts aggregated per Prometheus job + - record: job:ceems_gpu_memory_usage:avg_ratio + expr: |2 + avg by (job) ( + ( + DCGM_FI_DEV_FB_USED * 100 + / + (DCGM_FI_DEV_FB_USED + DCGM_FI_DEV_FB_FREE) + ) + ) + + - name: nvidia-dcgm-gpu-power-usage-rules + # interval: + rules: + # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. + - record: dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue + labels: + ceemspowersources: nvidia-dcgm + expr: label_replace(label_replace(label_replace(DCGM_FI_DEV_POWER_USAGE_INSTANT, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") # * + + # When profiling metrics are available, we split the total power consumed by physical + # GPU among all MIG instances based on "effective" SM usage on each MIG instance. This + # is estimated as TotalPower * number of SMs * Mean SM usage * SM Occupancy / sum(number of SMs * Mean SM usage * SM Occupancy) + # + # When profiling metrics are not available, we split the total power based on number of SMs + # available in that instance compared to total number of SMs being used in the physical GPU + - record: uuid:ceems_gpu_power_watts:pue + labels: + ceemspowersources: nvidia-dcgm + expr: |2 + ( + ( + ( + label_replace(label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") + * on (gpuuuid,gpuiid,hostname) + label_replace(label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") + * on (gpuuuid, gpuiid,hostname) group_right () + ceems_compute_unit_gpu_sm_count + ) + ) + / on (gpuuuid,hostname) group_left () + ( + sum by (gpuuuid,hostname) ( + ( + label_replace(label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", 
"(.*)"), "hostname", "$1", "Hostname", "(.*)") + * on (gpuuuid,gpuiid,hostname) + label_replace(label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_sm_count + ) + > + 0 + ) + ) + ) + * on (gpuuuid,gpuiid,hostname) group_left () + dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue + * on (gpuuuid,gpuiid,hostname,uuid) group_right () + ceems_compute_unit_gpu_index_flag + + # Total equivalent emissions rate (g/s) from GPU due to the power consumed by the compute unit's GPUs. + # The equivalent emissions are estimated using emission factor from owid for country FR + - record: uuid:ceems_gpu_emissions_g_s:pue + expr: |2 + ( + label_replace(uuid:ceems_gpu_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "nvidia-dcgm", + "instance", + "(.*)" + ) + ) + + # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job + - record: job:ceems_gpu_power_watts:pue + labels: + ceemspowersources: nvidia-dcgm + expr: sum by (job)(dev:DCGM_FI_DEV_POWER_USAGE_INSTANT:pue) + + # Total equivalent emissions rate (g/s) due to the power consumed by GPUs on all ths hosts + # in a Prometheus job accounting PUE value. 
+ - record: job:ceems_gpu_emissions_g_s:pue + expr: |2 + ( + label_replace(job:ceems_gpu_power_watts:pue / 3.6e+06, "provider", "owid", "instance", "(.*)") + * on (ceemspowersources) group_left () + label_replace( + ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, + "ceemspowersources", + "nvidia-dcgm", + "instance", + "(.*)" + ) + ) + + # Profiling metrics + - name: nvidia-dcgm-gpu-profiling-rules + # interval: + rules: + - record: uuid:ceems_gpu_prof_sm_active:ratio + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_SM_ACTIVE, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") * 100 + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_sm_occupancy:ratio + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_SM_OCCUPANCY, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") * 100 + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_gr_engine_active:ratio + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_GR_ENGINE_ACTIVE, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") * 100 + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_pipe_tensor_active:ratio + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") * 100 + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_pipe_fp64_active:ratio + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_PIPE_FP64_ACTIVE, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), 
"hostname", "$1", "Hostname", "(.*)") * 100 + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_pipe_fp32_active:ratio + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_PIPE_FP32_ACTIVE, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") * 100 + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_pipe_fp16_active:ratio + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_PIPE_FP16_ACTIVE, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") * 100 + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_dram_active:ratio + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_DRAM_ACTIVE, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") * 100 + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_nvlink_tx_bytes:rate + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_NVLINK_TX_BYTES, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_nvlink_rx_bytes:rate + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_NVLINK_RX_BYTES, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_pcie_tx_bytes:rate + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_PCIE_TX_BYTES, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", 
"GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + - record: uuid:ceems_gpu_prof_pcie_rx_bytes:rate + expr: |2 + label_replace(label_replace(label_replace(DCGM_FI_PROF_PCIE_RX_BYTES, "gpuuuid", "$1", "UUID", "(.*)"), "gpuiid", "$1", "GPU_I_ID", "(.*)"), "hostname", "$1", "Hostname", "(.*)") + * on (gpuuuid,gpuiid,hostname) group_right () + ceems_compute_unit_gpu_index_flag + + # Average usage for all hosts aggregated by Prometheus job + - record: job:ceems_gpu_prof_sm_active:ratio + expr: avg by (job) (DCGM_FI_PROF_SM_ACTIVE) * 100 + + - record: job:ceems_gpu_prof_sm_occupancy:ratio + expr: avg by (job) (DCGM_FI_PROF_SM_OCCUPANCY) * 100 + + - record: job:ceems_gpu_prof_gr_engine_active:ratio + expr: avg by (job) (DCGM_FI_PROF_GR_ENGINE_ACTIVE) * 100 + + - record: job:ceems_gpu_prof_pipe_tensor_active:ratio + expr: avg by (job) (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE) * 100 + + - record: job:ceems_gpu_prof_pipe_fp64_active:ratio + expr: avg by (job) (DCGM_FI_PROF_PIPE_FP64_ACTIVE) * 100 + + - record: job:ceems_gpu_prof_pipe_fp32_active:ratio + expr: avg by (job) (DCGM_FI_PROF_PIPE_FP32_ACTIVE) * 100 + + - record: job:ceems_gpu_prof_pipe_fp16_active:ratio + expr: avg by (job) (DCGM_FI_PROF_PIPE_FP16_ACTIVE) * 100 + + - record: job:ceems_gpu_prof_dram_active:ratio + expr: avg by (job) (DCGM_FI_PROF_DRAM_ACTIVE) * 100 + + - record: job:ceems_gpu_prof_nvlink_tx_bytes:rate + expr: avg by (job) (DCGM_FI_PROF_NVLINK_TX_BYTES) + + - record: job:ceems_gpu_prof_nvlink_rx_bytes:rate + expr: avg by (job) (DCGM_FI_PROF_NVLINK_RX_BYTES) + + - record: job:ceems_gpu_prof_pcie_tx_bytes:rate + expr: avg by (job) (DCGM_FI_PROF_PCIE_TX_BYTES) + + - record: job:ceems_gpu_prof_pcie_rx_bytes:rate + expr: avg by (job) (DCGM_FI_PROF_PCIE_RX_BYTES) diff --git a/etc/prometheus/rules/nvidia-gpu.rules b/etc/prometheus/rules/nvidia-gpu.rules deleted file mode 100644 index 0ee5310b..00000000 --- 
a/etc/prometheus/rules/nvidia-gpu.rules +++ /dev/null @@ -1,129 +0,0 @@ ---- -# Recording rules for NVIDIA GPUs scrape job . -# -# These rules map the GPU usage to the compute unit `uuid` which gives -# GPU metrics for each compute unit. -# -# We leverage these rules to include PUE (Power Usage Effectiveness) in the Power -# estimation as well. -# -# Placeholders to replace: -# : Prometheus job name -# : Prometheus job name under which DCGM exporter is running on the same host -# : Evaluation interval -# : Rate interval -# -groups: - - name: compute-unit-gpu-rules- - interval: - rules: - # GPU Usage (%) by compute unit - - record: uuid:ceems_gpu_usage:ratio - expr: |2 - DCGM_FI_DEV_GPU_UTIL{job=""} - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job=""} - - # GPU Memory Usage (%) by compute unit - - record: uuid:ceems_gpu_memory_usage:ratio - expr: |2 - ( - DCGM_FI_DEV_FB_USED{job=""} * 100 - / - (DCGM_FI_DEV_FB_USED{job=""} + DCGM_FI_DEV_FB_FREE{job=""}) - ) - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job=""} - - # Total power (Watts) consumed by the GPU by accounting Power Usage Effectiveness (PUE) value. - - record: dev:DCGM_FI_DEV_POWER_USAGE:pue - expr: 1 * DCGM_FI_DEV_POWER_USAGE{job=""} - - - record: uuid:ceems_gpu_power_watts:pue - expr: |2 - dev:DCGM_FI_DEV_POWER_USAGE:pue{job=""} - * on (gpuuuid,gpuiid) group_right() - ceems_compute_unit_gpu_index_flag{job=""} - - # Total equivalent emissions rate (g/s) from GPU due to the power consumed by the compute unit's GPUs. 
- # The equivalent emissions are estimated using emission factor from owid for country - # FR - - record: uuid:ceems_gpu_emissions_g_s:pue - expr: |2 - label_replace( - dev:DCGM_FI_DEV_POWER_USAGE:pue{job=""} / 3.6e+06 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job=""}, - "provider", - "owid", - "instance", - "(.*)" - ) - * on (provider) group_left () - label_replace( - ceems_emissions_gCo2_kWh{country_code="FR",provider="owid"}, - "common_label", - "mock", - "instance", - "(.*)" - ) - - - # Profiling metrics - - record: uuid:ceems_gpu_sm_active:ratio - expr: |2 - DCGM_FI_PROF_SM_ACTIVE{job=""} * 100 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job=""} - - - record: uuid:ceems_gpu_sm_occupancy:ratio - expr: |2 - DCGM_FI_PROF_SM_OCCUPANCY{job=""} * 100 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job=""} - - - record: uuid:ceems_gpu_gr_engine_active:ratio - expr: |2 - DCGM_FI_PROF_GR_ENGINE_ACTIVE{job=""} * 100 - * on (gpuuuid,gpuiid) group_right () - ceems_compute_unit_gpu_index_flag{job=""} - - - # The following recording rules estimate the average GPU, GPU memory usages and - # total GPU power and its equivalent emissions aggregared for all hosts - # per Prometheus job. 
- # - - name: host-agg-gpu-rules- - interval: - rules: - # Average GPU Usage (%) for all hosts aggregated per Prometheus job - - record: job:ceems_gpu_usage:avg - expr: avg by (job) (DCGM_FI_DEV_GPU_UTIL{job=""}) - - # Average GPU memory usage (%) for all hosts aggregated per Prometheus job - - record: job:ceems_gpu_memory_usage:avg_ratio - expr: |2 - avg by (job) ( - ( - DCGM_FI_DEV_FB_USED{job=""} * 100 - / - (DCGM_FI_DEV_FB_USED{job=""} + DCGM_FI_DEV_FB_FREE{job=""}) - ) - ) - - # Total power usage (Watts) by GPUs on all hosts aggregated per Prometheus job - - record: job:ceems_gpu_power_watts:pue - expr: sum by (job)(1 * DCGM_FI_DEV_POWER_USAGE{job=""}) - - # Total equivalent emissions rate (g/s) due to the power consumed by GPUs on all ths hosts - # in a Prometheus job accounting PUE value. - # The equivalent emissions are estimated for country FR - - record: job:ceems_gpu_emissions_g_s:pue - expr: |2 - sum by (job, country_code, country, provider) ( - ( - job:ceems_gpu_power_watts:pue{job=""} / 3.6e+06 - * on (job) group_right () - label_replace(ceems_emissions_gCo2_kWh, "job", "", "instance", "(.*)") - ) - ) diff --git a/pkg/collector/cray_pm_counters.go b/pkg/collector/cray_pm_counters.go index d9dc64d1..e7bb4922 100644 --- a/pkg/collector/cray_pm_counters.go +++ b/pkg/collector/cray_pm_counters.go @@ -20,6 +20,12 @@ import ( const crayPMCCollectorSubsystem = "cray_pm_counters" +// Only used in demo instances and tests. +var crayPMCSysPath = CEEMSExporterApp.Flag( + "collector.cray_pm_counters.path.sysfs", + "sysfs mountpoint for Cray PMC collector", +).Hidden().Default("").String() + // Currently supported PM counters domains. var ( pmcDomainRegex = regexp.MustCompile("((?:cpu|memory|accel)[0-9]*?)*?_*?(energy|power|temp)_*?(cap)*?") @@ -41,7 +47,12 @@ func init() { // NewCrayPMCCollector returns a new Collector exposing Cray's `pm_counters` metrics. 
func NewCrayPMCCollector(logger *slog.Logger) (Collector, error) { - fs, err := sysfs.NewFS(*sysPath) + sysFSPath := *sysPath + if *crayPMCSysPath != "" { + sysFSPath = *crayPMCSysPath + } + + fs, err := sysfs.NewFS(sysFSPath) if err != nil { return nil, err } diff --git a/pkg/collector/rapl.go b/pkg/collector/rapl.go index 1fdb6a99..906584b9 100644 --- a/pkg/collector/rapl.go +++ b/pkg/collector/rapl.go @@ -42,17 +42,30 @@ type raplCountersSecurityCtxData struct { } func init() { - RegisterCollector(raplCollectorSubsystem, defaultEnabled, NewRaplCollector) + RegisterCollector(raplCollectorSubsystem, defaultDisabled, NewRaplCollector) } -var raplZoneLabel = CEEMSExporterApp.Flag( - "collector.rapl.enable-zone-label", - "Enables RAPL zone labels (default: disabled)", -).Default("false").Bool() +var ( + raplZoneLabel = CEEMSExporterApp.Flag( + "collector.rapl.enable-zone-label", + "Enables RAPL zone labels (default: disabled)", + ).Default("false").Bool() + + // Only used in demo instances and tests. + raplSysPath = CEEMSExporterApp.Flag( + "collector.rapl.path.sysfs", + "sysfs mountpoint for RAPL collector", + ).Hidden().Default("").String() +) // NewRaplCollector returns a new Collector exposing RAPL metrics. 
func NewRaplCollector(logger *slog.Logger) (Collector, error) { - fs, err := sysfs.NewFS(*sysPath) + sysFSPath := *sysPath + if *raplSysPath != "" { + sysFSPath = *raplSysPath + } + + fs, err := sysfs.NewFS(sysFSPath) if err != nil { return nil, err } diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh index 51d5c563..868bc655 100755 --- a/scripts/e2e-test.sh +++ b/scripts/e2e-test.sh @@ -41,6 +41,15 @@ do esac done +# Kill any existing servers from previous tests +pkill mock_exporters || true +pkill mock_servers || true +pkill ceems_exporter || true +pkill ceems_api_server || true +pkill ceems_lb || true +pkill redfish_proxy || true +pkill ceems_k8s_admission_controller || true + if [[ "${scenario}" =~ ^"exporter" ]] then # cgroups_mode=$([ $(stat -fc %T /sys/fs/cgroup/) = "cgroup2fs" ] && echo "unified" || ( [ -e /sys/fs/cgroup/unified/ ] && echo "hybrid" || echo "legacy")) @@ -528,6 +537,7 @@ then --collector.redfish.web-config="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ --collector.cray_pm_counters \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -548,6 +558,7 @@ then --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/freeipmi/ipmi-dcmi" \ --collector.ipmi.test-mode \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -574,6 +585,7 @@ then --collector.redfish.config.file.expand-env-vars \ --collector.netdev \ --collector.netdev.device-include="eth0" \ + --collector.rapl \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ --log.level="debug" > "${logfile}" 2>&1 & @@ -595,6 +607,7 @@ then --collector.ipmi \ --collector.ipmi.test-mode \ --collector.cray_pm_counters \ + --collector.rapl \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ --log.level="debug" > 
"${logfile}" 2>&1 & @@ -616,6 +629,7 @@ then --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ + --collector.rapl \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ --log.level="debug" > "${logfile}" 2>&1 & @@ -633,6 +647,7 @@ then --collector.ipmi \ --collector.ipmi.test-mode \ --collector.infiniband \ + --collector.rapl \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ --log.level="debug" > "${logfile}" 2>&1 & @@ -651,6 +666,7 @@ then --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/ipmiutils/ipmiutil" \ --collector.ipmi.test-mode \ --collector.cray_pm_counters \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -675,6 +691,7 @@ then --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ --collector.cray_pm_counters \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -696,6 +713,7 @@ then --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ --collector.ipmi.test-mode \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -720,6 +738,7 @@ then --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -745,6 +764,7 @@ then --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ --collector.empty-hostname-label \ + --collector.rapl \ --web.listen-address 
"127.0.0.1:${port}" \ --web.disable-exporter-metrics \ --log.level="debug" > "${logfile}" 2>&1 & @@ -763,6 +783,7 @@ then --collector.ipmi \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ --collector.ipmi.test-mode \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -785,6 +806,7 @@ then --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -803,6 +825,7 @@ then --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ --collector.ipmi.test-mode \ --collector.empty-hostname-label \ + --collector.rapl \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ --log.level="debug" > "${logfile}" 2>&1 & @@ -820,6 +843,7 @@ then --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/capmc/capmc" \ --collector.ipmi.test-mode \ --collector.empty-hostname-label \ + --collector.rapl \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ --log.level="debug" > "${logfile}" 2>&1 & @@ -839,6 +863,7 @@ then --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -855,6 +880,7 @@ then --collector.k8s \ --collector.k8s.kube-config-file="pkg/collector/testdata/k8s/kubeconfig.yml" \ --collector.k8s.kubelet-socket-file="${CEEMS_KUBELET_SOCKET_DIR}/amd/kubelet.sock" \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -874,6 +900,7 @@ then --collector.redfish \ 
--collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:${port}" \ --web.disable-exporter-metrics \ @@ -1504,6 +1531,9 @@ then ./bin/mock_servers redfish > /dev/null 2>&1 & MOCK_REDFISH_PID=$! + # Check rules provided in repo + find etc/prometheus -name "*.rules" | xargs -I {} promtool check rules {} >> "${logfile}" 2>&1 + waitport "5000" ./bin/mock_exporters test-mode dcgm amd-smi amd-device-metrics > /dev/null 2>&1 & @@ -1524,6 +1554,7 @@ then --collector.ipmi \ --collector.ipmi.test-mode \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/freeipmi/ipmi-dcmi" \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:9010" \ --web.disable-exporter-metrics \ @@ -1538,6 +1569,7 @@ then --collector.cgroups.force-version="v1" \ --collector.slurm \ --collector.gpu.type="nogpu" \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:9011" \ --web.disable-exporter-metrics \ @@ -1555,6 +1587,7 @@ then --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:9012" \ --web.disable-exporter-metrics \ @@ -1568,11 +1601,13 @@ then --path.procfs="pkg/collector/testdata/proc" \ --collector.cgroups.force-version="v1" \ --collector.slurm \ + --collector.slurm.gres-config-file="pkg/collector/testdata/gres.conf" \ --collector.gpu.type="nvidia" \ --collector.gpu.nvidia-smi-path="pkg/collector/testdata/nvidia-smi" \ --collector.redfish \ --collector.redfish.config.file="pkg/collector/testdata/redfish/config.yml" \ --collector.redfish.config.file.expand-env-vars \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:9013" \ 
--web.disable-exporter-metrics \ @@ -1589,28 +1624,48 @@ then --collector.gpu.type="amd" \ --collector.gpu.rocm-smi-path="pkg/collector/testdata/rocm-smi" \ --collector.cray_pm_counters \ + --collector.rapl \ --collector.empty-hostname-label \ --web.listen-address "127.0.0.1:9014" \ --web.disable-exporter-metrics \ --log.level="debug" > /dev/null 2>&1 & MOCK_EXPORTER5_PID=$! + # Hwmon available + ./bin/ceems_exporter \ + --path.sysfs="pkg/collector/testdata/sys" \ + --path.cgroupfs="pkg/collector/testdata/sys/fs/cgroup" \ + --path.procfs="pkg/collector/testdata/proc" \ + --collector.cgroups.force-version="v1" \ + --collector.slurm \ + --collector.gpu.type="amd" \ + --collector.gpu.rocm-smi-path="pkg/collector/testdata/rocm-smi" \ + --collector.hwmon \ + --collector.rapl \ + --collector.empty-hostname-label \ + --web.listen-address "127.0.0.1:9015" \ + --web.disable-exporter-metrics \ + --log.level="debug" > /dev/null 2>&1 & + MOCK_EXPORTER6_PID=$! + # Only IPMI available (No RAPL). IPMI includes GPU power ./bin/ceems_exporter \ --path.cgroupfs="pkg/collector/testdata/sys/fs/cgroup" \ --path.procfs="pkg/collector/testdata/proc" \ --collector.cgroups.force-version="v1" \ --collector.slurm \ + --collector.slurm.gres-config-file="pkg/collector/testdata/gres.conf" \ --collector.gpu.type="nvidia" \ --collector.gpu.nvidia-smi-path="pkg/collector/testdata/nvidia-smi" \ --collector.ipmi \ --collector.ipmi.test-mode \ --collector.ipmi.dcmi.cmd="pkg/collector/testdata/ipmi/openipmi/ipmitool" \ + --collector.rapl \ --collector.empty-hostname-label \ - --web.listen-address "127.0.0.1:9015" \ + --web.listen-address "127.0.0.1:9016" \ --web.disable-exporter-metrics \ --log.level="debug" > /dev/null 2>&1 & - MOCK_EXPORTER6_PID=$! + MOCK_EXPORTER7_PID=$! 
# Emissions target ./bin/ceems_exporter \ @@ -1618,10 +1673,10 @@ then --collector.emissions \ --collector.emissions.provider=owid \ --collector.empty-hostname-label \ - --web.listen-address "127.0.0.1:9016" \ + --web.listen-address "127.0.0.1:9017" \ --web.disable-exporter-metrics \ --log.level="debug" > /dev/null 2>&1 & - MOCK_EXPORTER7_PID=$! + MOCK_EXPORTER8_PID=$! waitport "9010" waitport "9011" @@ -1630,6 +1685,7 @@ then waitport "9014" waitport "9015" waitport "9016" + waitport "9017" prometheus \ --config.file cmd/ceems_tool/testdata/prometheus.yml \ @@ -1646,6 +1702,10 @@ then # Sleep a while for Prometheus to scrape metrics sleep 30 + # Get series from rules + ./scripts/extract_query_value "job series" $(curl -gs 'http://localhost:9090/api/v1/query?' --data-urlencode 'query=count({__name__=~"job:(.*)"})') >> "${fixture_output}" + ./scripts/extract_query_value "uuid series" $(curl -gs 'http://localhost:9090/api/v1/query?' --data-urlencode 'query=count({__name__=~"uuid:(.*)"})') >> "${fixture_output}" + echo "0" | ./bin/ceems_tool tsdb create-recording-rules --country-code=FR --output-dir "${tmpdir}/rules" >> "${logfile}" 2>&1 # Add content of each recording file to fixture_output @@ -1670,7 +1730,7 @@ then done # Rules with static emission factor - echo "1" | ./bin/ceems_tool tsdb create-recording-rules --emission-factor=50 --output-dir "${tmpdir}/rules" >> "${logfile}" 2>&1 + echo "0" | ./bin/ceems_tool tsdb create-recording-rules --emission-factor=50 --output-dir "${tmpdir}/rules" >> "${logfile}" 2>&1 # Add content of each recording file to fixture_output find "${tmpdir}/rules" -type f -print0 | sort -z | while IFS= read -r -d $'\0' file; do @@ -1690,8 +1750,14 @@ then # Kill and restart Prom to sync data kill -9 "${PROMETHEUS_PID}" + # Make a copy of Prometheus config + cp cmd/ceems_tool/testdata/prometheus.yml "${tmpdir}/prometheus.yml" + + # Ignore existing recording rules + sed -i 's/prometheus/prometheus1/g' "${tmpdir}/prometheus.yml" + prometheus \ 
- --config.file cmd/ceems_tool/testdata/prometheus.yml \ + --config.file "${tmpdir}/prometheus.yml" \ --storage.tsdb.retention.time 10y \ --storage.tsdb.path "${tmpdir}/tsdb" \ --log.level="debug" >> "${logfile}" 2>&1 & diff --git a/scripts/extract_query_value b/scripts/extract_query_value new file mode 100755 index 00000000..397b93ac --- /dev/null +++ b/scripts/extract_query_value @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +import sys +import json + + +def ordered(obj): + "Sort dict recursively" + if isinstance(obj, dict): + return sorted((k, ordered(v)) for k, v in obj.items()) + if isinstance(obj, list): + return sorted(ordered(x) for x in obj) + else: + return obj + + +def main(): + if len(sys.argv) < 3: + sys.exit('Two arguments needed') + + # Get prom query result + query_type = sys.argv[1] + query_result = sys.argv[2] + + # Read file contents into dicts + try: + qr = json.loads(query_result) + value = qr['data']['result'][0]['value'][1] + + print(f'Number of series found for {query_type} are: {value}') + except Exception as e: + print(e) + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/scripts/mock_exporters/main.go b/scripts/mock_exporters/main.go index 7f56a91e..94f9bc5d 100644 --- a/scripts/mock_exporters/main.go +++ b/scripts/mock_exporters/main.go @@ -2,7 +2,6 @@ package main import ( "context" - "fmt" "log" "math/rand/v2" "net/http" @@ -37,6 +36,15 @@ type dcgmCollector struct { gpuSMActive *prometheus.Desc gpuSMOcc *prometheus.Desc gpuGREngActive *prometheus.Desc + gpuPipeActive *prometheus.Desc + gpuFP64Active *prometheus.Desc + gpuFP32Active *prometheus.Desc + gpuFP16Active *prometheus.Desc + gpuNVLTX *prometheus.Desc + gpuNVLRX *prometheus.Desc + gpuDRAMActive *prometheus.Desc + gpuPCIeRX *prometheus.Desc + gpuPCIeTX *prometheus.Desc } func randFloat(minVal, maxVal float64) float64 { @@ -45,11 +53,18 @@ func randFloat(minVal, maxVal float64) float64 { func newDCGMCollector() *dcgmCollector { devices := []Device{ - {"0", 
"GPU-956348bc-d43d-23ed-53d4-857749fa2b67", "0", "00000000:15:00.0"}, - {"1", "GPU-956348bc-d43d-23ed-53d4-857749fa2b67", "1", "00000000:15:00.0"}, - {"2", "GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7", "", "00000000:16:00.0"}, - {"3", "GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3", "", "00000000:17:00.0"}, - {"4", "GPU-1d4d0f3e-b51a-4040-96e3-bf380f7c5728", "", "00000000:18:00.0"}, + {"0", "GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e", "", "00000000:10:00.0"}, + {"1", "GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3", "", "00000000:15:00.0"}, + {"2", "GPU-956348bc-d43d-23ed-53d4-857749fa2b67", "1", "00000000:21:00.0"}, + {"2", "GPU-956348bc-d43d-23ed-53d4-857749fa2b67", "5", "00000000:21:00.0"}, + {"2", "GPU-956348bc-d43d-23ed-53d4-857749fa2b67", "13", "00000000:21:00.0"}, + {"3", "GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7", "1", "00000000:81:00.0"}, + {"3", "GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7", "5", "00000000:81:00.0"}, + {"3", "GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7", "6", "00000000:81:00.0"}, + {"4", "GPU-61a65011-6571-a6d2-5th8-66cbb6f7f9c3", "", "00000000:83:00.0"}, + {"5", "GPU-61a65011-6571-a64n-5ab8-66cbb6f7f9c3", "", "00000000:85:00.0"}, + {"6", "GPU-1d4d0f3e-b51a-4040-96e3-bf380f7c5728", "", "00000000:87:00.0"}, + {"7", "GPU-6cc98505-fdde-461e-a93c-6935fba45a27", "", "00000000:89:00.0"}, } return &dcgmCollector{ @@ -86,56 +101,172 @@ func newDCGMCollector() *dcgmCollector { "GPU GR engien active", []string{"Hostname", "UUID", "GPU_I_ID", "device", "gpu", "pci_bus_id", "modelName"}, nil, ), + gpuPipeActive: prometheus.NewDesc("DCGM_FI_PROF_PIPE_TENSOR_ACTIVE", + "GPU GR engien active", + []string{"Hostname", "UUID", "GPU_I_ID", "device", "gpu", "pci_bus_id", "modelName"}, nil, + ), + gpuFP64Active: prometheus.NewDesc("DCGM_FI_PROF_PIPE_FP64_ACTIVE", + "GPU GR engien active", + []string{"Hostname", "UUID", "GPU_I_ID", "device", "gpu", "pci_bus_id", "modelName"}, nil, + ), + gpuFP32Active: prometheus.NewDesc("DCGM_FI_PROF_PIPE_FP32_ACTIVE", + "GPU GR engien active", 
+ []string{"Hostname", "UUID", "GPU_I_ID", "device", "gpu", "pci_bus_id", "modelName"}, nil, + ), + gpuFP16Active: prometheus.NewDesc("DCGM_FI_PROF_PIPE_FP16_ACTIVE", + "GPU GR engien active", + []string{"Hostname", "UUID", "GPU_I_ID", "device", "gpu", "pci_bus_id", "modelName"}, nil, + ), + gpuDRAMActive: prometheus.NewDesc("DCGM_FI_PROF_DRAM_ACTIVE", + "GPU GR engien active", + []string{"Hostname", "UUID", "GPU_I_ID", "device", "gpu", "pci_bus_id", "modelName"}, nil, + ), + gpuNVLTX: prometheus.NewDesc("DCGM_FI_PROF_NVLINK_TX_BYTES", + "GPU GR engien active", + []string{"Hostname", "UUID", "GPU_I_ID", "device", "gpu", "pci_bus_id", "modelName"}, nil, + ), + gpuNVLRX: prometheus.NewDesc("DCGM_FI_PROF_NVLINK_RX_BYTES", + "GPU GR engien active", + []string{"Hostname", "UUID", "GPU_I_ID", "device", "gpu", "pci_bus_id", "modelName"}, nil, + ), + gpuPCIeTX: prometheus.NewDesc("DCGM_FI_PROF_PCIE_TX_BYTES", + "GPU GR engien active", + []string{"Hostname", "UUID", "GPU_I_ID", "device", "gpu", "pci_bus_id", "modelName"}, nil, + ), + gpuPCIeRX: prometheus.NewDesc("DCGM_FI_PROF_PCIE_RX_BYTES", + "GPU GR engien active", + []string{"Hostname", "UUID", "GPU_I_ID", "device", "gpu", "pci_bus_id", "modelName"}, nil, + ), } } // Describe writes all descriptors to the prometheus desc channel. func (collector *dcgmCollector) Describe(ch chan<- *prometheus.Desc) { ch <- collector.gpuUtil + ch <- collector.gpuMemUsed + ch <- collector.gpuMemFree + ch <- collector.gpuPower + ch <- collector.gpuSMActive + ch <- collector.gpuSMOcc + ch <- collector.gpuGREngActive + + ch <- collector.gpuPipeActive + + ch <- collector.gpuFP64Active + + ch <- collector.gpuFP32Active + + ch <- collector.gpuFP16Active + + ch <- collector.gpuDRAMActive + + ch <- collector.gpuNVLRX + + ch <- collector.gpuNVLTX + + ch <- collector.gpuPCIeRX + + ch <- collector.gpuPCIeTX } // Collect implements required collect function for all promehteus collectors. 
func (collector *dcgmCollector) Collect(ch chan<- prometheus.Metric) { - for idev, dev := range collector.devices { + // Generate random power consumptions for physical devices + powerUsage := make(map[string]float64) + for _, dev := range collector.devices { + powerUsage[dev.ID] = randFloat(minNvGPUPower, maxNvGPUPower) + } + + for _, dev := range collector.devices { ch <- prometheus.MustNewConstMetric( collector.gpuUtil, prometheus.GaugeValue, 100*rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec - fmt.Sprintf("nvidia%d", idev), strconv.Itoa(idev), dev.PCIAddr, "NVIDIA A100 80GiB", + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", ) + ch <- prometheus.MustNewConstMetric( collector.gpuMemUsed, prometheus.GaugeValue, 100*rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec - fmt.Sprintf("nvidia%d", idev), strconv.Itoa(idev), dev.PCIAddr, "NVIDIA A100 80GiB", + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", ) + ch <- prometheus.MustNewConstMetric( collector.gpuMemFree, prometheus.GaugeValue, 100*rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec - fmt.Sprintf("nvidia%d", idev), strconv.Itoa(idev), dev.PCIAddr, "NVIDIA A100 80GiB", + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", ) - power := randFloat(minNvGPUPower, maxNvGPUPower) ch <- prometheus.MustNewConstMetric( - collector.gpuPower, prometheus.GaugeValue, power, "host", dev.UUID, dev.IID, - fmt.Sprintf("nvidia%d", idev), strconv.Itoa(idev), dev.PCIAddr, "NVIDIA A100 80GiB", + collector.gpuPower, prometheus.GaugeValue, powerUsage[dev.ID], "host", dev.UUID, dev.IID, + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", ) + ch <- prometheus.MustNewConstMetric( - collector.gpuPowerInst, prometheus.GaugeValue, power, "host", dev.UUID, dev.IID, - fmt.Sprintf("nvidia%d", idev), strconv.Itoa(idev), dev.PCIAddr, "NVIDIA A100 80GiB", + collector.gpuPowerInst, prometheus.GaugeValue, powerUsage[dev.ID], "host", dev.UUID, dev.IID, + "nvidia"+dev.ID, dev.ID, 
dev.PCIAddr, "NVIDIA A100 80GiB", ) + ch <- prometheus.MustNewConstMetric( collector.gpuSMActive, prometheus.GaugeValue, rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec - fmt.Sprintf("nvidia%d", idev), strconv.Itoa(idev), dev.PCIAddr, "NVIDIA A100 80GiB", + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", ) + ch <- prometheus.MustNewConstMetric( collector.gpuSMOcc, prometheus.GaugeValue, rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec - fmt.Sprintf("nvidia%d", idev), strconv.Itoa(idev), dev.PCIAddr, "NVIDIA A100 80GiB", + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", ) + ch <- prometheus.MustNewConstMetric( collector.gpuGREngActive, prometheus.GaugeValue, rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec - fmt.Sprintf("nvidia%d", idev), strconv.Itoa(idev), dev.PCIAddr, "NVIDIA A100 80GiB", + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuPipeActive, prometheus.GaugeValue, rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuFP64Active, prometheus.GaugeValue, rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuFP32Active, prometheus.GaugeValue, rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuFP16Active, prometheus.GaugeValue, rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuDRAMActive, prometheus.GaugeValue, rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", + ) + + ch <- 
prometheus.MustNewConstMetric( + collector.gpuNVLRX, prometheus.GaugeValue, 1024*1024*1024*rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuNVLTX, prometheus.GaugeValue, 1024*1024*1024*rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuPCIeTX, prometheus.GaugeValue, 1024*1024*rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuPCIeRX, prometheus.GaugeValue, 1024*1024*rand.Float64(), "host", dev.UUID, dev.IID, //nolint:gosec + "nvidia"+dev.ID, dev.ID, dev.PCIAddr, "NVIDIA A100 80GiB", ) } } @@ -176,7 +307,9 @@ func newAMDSMICollector() *amdSMICollector { // Describe writes all descriptors to the prometheus desc channel. 
func (collector *amdSMICollector) Describe(ch chan<- *prometheus.Desc) { ch <- collector.gpuUtil + ch <- collector.gpuMemUtil + ch <- collector.gpuPower } @@ -187,13 +320,14 @@ func (collector *amdSMICollector) Collect(ch chan<- prometheus.Metric) { collector.gpuUtil, prometheus.GaugeValue, 100*rand.Float64(), strconv.Itoa(idev), //nolint:gosec "Advanced Micro Devices Inc", ) + ch <- prometheus.MustNewConstMetric( collector.gpuMemUtil, prometheus.GaugeValue, 100*rand.Float64(), strconv.Itoa(idev), //nolint:gosec "Advanced Micro Devices Inc", ) // GPU power reported in micro Watts ch <- prometheus.MustNewConstMetric( - collector.gpuPower, prometheus.GaugeValue, randFloat(minAMDGPUPower, maxAMDGPUPower), strconv.Itoa(idev), + collector.gpuPower, prometheus.GaugeValue, 1e6*randFloat(minAMDGPUPower, maxAMDGPUPower), strconv.Itoa(idev), "Advanced Micro Devices Inc", ) } @@ -201,69 +335,231 @@ func (collector *amdSMICollector) Collect(ch chan<- prometheus.Metric) { // AMD Device Metrics collector. 
type amdDeviceMetricsCollector struct { - devices []Device - gpuUtil *prometheus.Desc - gpuMemTotal *prometheus.Desc - gpuMemUsed *prometheus.Desc - gpuPower *prometheus.Desc + devices []Device + gpuUtil *prometheus.Desc + gpuVRAMTotal *prometheus.Desc + gpuVRAMUsed *prometheus.Desc + gpuGTTTotal *prometheus.Desc + gpuGTTUsed *prometheus.Desc + gpuVisibleRAMTotal *prometheus.Desc + gpuVisibleRAMUsed *prometheus.Desc + gpuPower *prometheus.Desc + gpuSMActive *prometheus.Desc + gpuProfOccupancy *prometheus.Desc + gpuTensorActive *prometheus.Desc + gpuFP64Ops *prometheus.Desc + gpuFP32Ops *prometheus.Desc + gpuFP16Ops *prometheus.Desc + gpuWriteSize *prometheus.Desc + gpuReadSize *prometheus.Desc } -func newAMDDeviceMetricsCollector() *amdDeviceMetricsCollector { +func newAMDDeviceMetricsCollector(prefix string) *amdDeviceMetricsCollector { devices := []Device{ - {"0", "20170000800c", "", "00000000:15:00.0"}, - {"1", "20170003580c", "", "00000000:16:00.0"}, - {"2", "20180003050c", "", "00000000:17:00.0"}, - {"3", "20170005280c", "", "00000000:18:00.0"}, + {"0", "20170000800c", "0", "00000000:15:00.0"}, + {"1", "20170003580c", "0", "00000000:16:00.0"}, + {"2", "20170003580c", "1", "00000000:16:00.0"}, + {"3", "20170003580c", "2", "00000000:16:00.0"}, + {"4", "20170003580c", "3", "00000000:16:00.0"}, + {"5", "20170003580c", "4", "00000000:16:00.0"}, + {"6", "20170003580c", "5", "00000000:16:00.0"}, + {"7", "20170003580c", "6", "00000000:16:00.0"}, + {"8", "20170003580c", "7", "00000000:16:00.0"}, + {"9", "20180003050c", "0", "00000000:17:00.0"}, + {"10", "20180003050c", "1", "00000000:17:00.0"}, + {"11", "20170005280c", "0", "00000000:18:00.0"}, } return &amdDeviceMetricsCollector{ devices: devices, - gpuUtil: prometheus.NewDesc("gpu_gfx_activity", + gpuUtil: prometheus.NewDesc(prefix+"gpu_gfx_activity", "GPU utilization", []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, ), - gpuMemTotal: prometheus.NewDesc("gpu_total_vram", - "GPU memory total", + 
gpuVRAMTotal: prometheus.NewDesc(prefix+"gpu_total_vram", + "GPU VRAM total", []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, ), - gpuMemUsed: prometheus.NewDesc("gpu_used_vram", - "GPU memory used", + gpuVRAMUsed: prometheus.NewDesc(prefix+"gpu_used_vram", + "GPU VRAM used", []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, ), - gpuPower: prometheus.NewDesc("gpu_power_usage", + gpuGTTTotal: prometheus.NewDesc(prefix+"gpu_total_gtt", + "GTT memory total", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), + gpuGTTUsed: prometheus.NewDesc(prefix+"gpu_used_gtt", + "GTT memory used", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), + gpuVisibleRAMTotal: prometheus.NewDesc(prefix+"gpu_total_visible_vram", + "Visible RAM memory total", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), + gpuVisibleRAMUsed: prometheus.NewDesc(prefix+"gpu_used_visible_vram", + "Visible RAM memory used", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), + gpuPower: prometheus.NewDesc(prefix+"gpu_package_power", "GPU power", []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, ), + gpuSMActive: prometheus.NewDesc(prefix+"gpu_prof_sm_active", + "GPU SM active", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), + gpuTensorActive: prometheus.NewDesc(prefix+"gpu_prof_tensor_active_percent", + "GPU SM active", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), + gpuProfOccupancy: prometheus.NewDesc(prefix+"gpu_prof_occupancy_percent", + "GPU occupancy", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), + gpuFP64Ops: prometheus.NewDesc(prefix+"gpu_prof_total_64_ops", + "GPU FP64 ops", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), + gpuFP32Ops: prometheus.NewDesc(prefix+"gpu_prof_total_32_ops", + "GPU FP64 ops", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), + gpuFP16Ops: 
prometheus.NewDesc(prefix+"gpu_prof_total_16_ops", + "GPU FP64 ops", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), + gpuWriteSize: prometheus.NewDesc(prefix+"gpu_prof_write_size", + "GPU FP64 ops", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), + gpuReadSize: prometheus.NewDesc(prefix+"gpu_prof_fetch_size", + "GPU FP64 ops", + []string{"gpu_id", "gpu_partition_id", "serial_number"}, nil, + ), } } // Describe writes all descriptors to the prometheus desc channel. func (collector *amdDeviceMetricsCollector) Describe(ch chan<- *prometheus.Desc) { ch <- collector.gpuUtil - ch <- collector.gpuMemTotal - ch <- collector.gpuMemUsed + + ch <- collector.gpuVRAMTotal + + ch <- collector.gpuVRAMTotal + + ch <- collector.gpuGTTTotal + + ch <- collector.gpuGTTUsed + + ch <- collector.gpuVisibleRAMTotal + + ch <- collector.gpuVisibleRAMUsed + ch <- collector.gpuPower + + ch <- collector.gpuSMActive + + ch <- collector.gpuProfOccupancy + + ch <- collector.gpuTensorActive + + ch <- collector.gpuFP16Ops + + ch <- collector.gpuFP32Ops + + ch <- collector.gpuFP64Ops + + ch <- collector.gpuWriteSize + + ch <- collector.gpuReadSize } // Collect implements required collect function for all promehteus collectors. 
func (collector *amdDeviceMetricsCollector) Collect(ch chan<- prometheus.Metric) { - for idev := range collector.devices { + for _, dev := range collector.devices { ch <- prometheus.MustNewConstMetric( - collector.gpuUtil, prometheus.GaugeValue, 100*rand.Float64(), strconv.Itoa(idev), //nolint:gosec - "0", "Advanced Micro Devices Inc", + collector.gpuUtil, prometheus.GaugeValue, 100*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, ) + ch <- prometheus.MustNewConstMetric( - collector.gpuMemTotal, prometheus.GaugeValue, 1024*1024*1024*24, strconv.Itoa(idev), - "0", "Advanced Micro Devices Inc", + collector.gpuVRAMTotal, prometheus.GaugeValue, 1024*1024*1024*24, dev.ID, + dev.IID, dev.UUID, ) + ch <- prometheus.MustNewConstMetric( - collector.gpuMemUsed, prometheus.GaugeValue, 1024*1024*1024*24*rand.Float64(), strconv.Itoa(idev), //nolint:gosec - "0", "Advanced Micro Devices Inc", + collector.gpuVRAMUsed, prometheus.GaugeValue, 1024*1024*1024*24*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, ) - // GPU power reported in micro Watts + + ch <- prometheus.MustNewConstMetric( + collector.gpuGTTTotal, prometheus.GaugeValue, 1024*1024*24, dev.ID, + dev.IID, dev.UUID, + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuGTTUsed, prometheus.GaugeValue, 1024*1024*24*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuVisibleRAMTotal, prometheus.GaugeValue, 1024*1024*24, dev.ID, + dev.IID, dev.UUID, + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuVisibleRAMUsed, prometheus.GaugeValue, 1024*1024*24*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, + ) + + var power float64 = 0 + if dev.IID == "0" { + power = randFloat(minAMDGPUPower, maxAMDGPUPower) + } + + ch <- prometheus.MustNewConstMetric( + collector.gpuPower, prometheus.GaugeValue, power, dev.ID, + dev.IID, dev.UUID, + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuSMActive, 
prometheus.GaugeValue, 100*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, + ) + ch <- prometheus.MustNewConstMetric( - collector.gpuPower, prometheus.GaugeValue, randFloat(minAMDGPUPower, maxAMDGPUPower), strconv.Itoa(idev), - "0", "Advanced Micro Devices Inc", + collector.gpuTensorActive, prometheus.GaugeValue, 100*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuProfOccupancy, prometheus.GaugeValue, 100*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuFP64Ops, prometheus.GaugeValue, 1e8*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuFP32Ops, prometheus.GaugeValue, 1e6*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuFP16Ops, prometheus.GaugeValue, 1e3*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuWriteSize, prometheus.GaugeValue, 5e6*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, + ) + + ch <- prometheus.MustNewConstMetric( + collector.gpuReadSize, prometheus.GaugeValue, 3e6*rand.Float64(), dev.ID, //nolint:gosec + dev.IID, dev.UUID, ) } } @@ -282,8 +578,10 @@ func dcgmExporter(ctx context.Context) { ReadHeaderTimeout: 3 * time.Second, Handler: mux, } + defer func() { - if err := server.Shutdown(ctx); err != nil { + err := server.Shutdown(ctx) + if err != nil { log.Println("Failed to shutdown fake NVIDIA DCGM exporter server", err) } }() @@ -309,8 +607,10 @@ func amdSMIExporter(ctx context.Context) { ReadHeaderTimeout: 3 * time.Second, Handler: mux, } + defer func() { - if err := server.Shutdown(ctx); err != nil { + err := server.Shutdown(ctx) + if err != nil { log.Println("Failed to shutdown fake AMD SMI exporter server", err) } }() @@ -322,8 +622,8 @@ func amdSMIExporter(ctx 
context.Context) { } } -func amdDeviceMetricsExporter(ctx context.Context) { - amdDeviceMetrics := newAMDDeviceMetricsCollector() +func amdDeviceMetricsExporter(ctx context.Context, prefix string) { + amdDeviceMetrics := newAMDDeviceMetricsCollector(prefix) amdDeviceMetricsRegistry := prometheus.NewRegistry() amdDeviceMetricsRegistry.MustRegister(amdDeviceMetrics) @@ -336,8 +636,10 @@ func amdDeviceMetricsExporter(ctx context.Context) { ReadHeaderTimeout: 3 * time.Second, Handler: mux, } + defer func() { - if err := server.Shutdown(ctx); err != nil { + err := server.Shutdown(ctx) + if err != nil { log.Println("Failed to shutdown fake AMD device metrics exporter server", err) } }() @@ -363,13 +665,13 @@ func main() { if slices.Contains(args, "test-mode") { minNvGPUPower = 200.0 maxNvGPUPower = 200.0 - minAMDGPUPower = 100000000.0 - maxAMDGPUPower = 100000000.0 + minAMDGPUPower = 100.0 + maxAMDGPUPower = 100.0 } else { minNvGPUPower = 60.0 maxNvGPUPower = 700.0 - minAMDGPUPower = 30000000.0 - maxAMDGPUPower = 500000000.0 + minAMDGPUPower = 30.0 + maxAMDGPUPower = 500.0 } if slices.Contains(args, "dcgm") { @@ -386,7 +688,7 @@ func main() { if slices.Contains(args, "amd-device-metrics") { go func() { - amdDeviceMetricsExporter(ctx) + amdDeviceMetricsExporter(ctx, "amd_") }() } diff --git a/scripts/packaging.sh b/scripts/packaging.sh index 7286b993..67c9901d 100755 --- a/scripts/packaging.sh +++ b/scripts/packaging.sh @@ -59,8 +59,8 @@ test () { systemctl is-active --quiet "${app}.service" && echo "${app}" is running done - # Test if cacct has setgid bit on it - [ -g "/usr/local/bin/cacct" ] && printf "cacct has setgid set\n" + # Test if cacct has setuid bit on it + [ -u "/usr/local/bin/cacct" ] && printf "cacct has setuid set\n" || printf "cacct does not have setuid set\n" } if [ ${build} -ne 0 ] diff --git a/thirdparty/grafana/dashboards/k8s/k8s-single-pod-metrics.json b/thirdparty/grafana/dashboards/k8s/k8s-single-pod-metrics.json index f0a22ee1..02bbda96 100644 --- 
a/thirdparty/grafana/dashboards/k8s/k8s-single-pod-metrics.json +++ b/thirdparty/grafana/dashboards/k8s/k8s-single-pod-metrics.json @@ -1820,7 +1820,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_sm_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_sm_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "SM Activity on GPU: {{index}} on {{hostname}}", @@ -1833,7 +1833,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_sm_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_sm_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average SM Activity on all GPUs", @@ -1846,7 +1846,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_sm_occupancy:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_sm_occupancy:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "SM Occupancy GPU: {{index}} on {{hostname}}", @@ -1859,7 +1859,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_sm_occupancy:ratio{uuid=\"${uuid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_sm_occupancy:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average SM Occupancy on all GPUs", @@ -1964,7 +1964,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_gr_engine_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_gr_engine_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Graphics Engine Activity on GPU: {{index}} on {{hostname}}", @@ -1977,7 +1977,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_gr_engine_active:ratio{uuid=\"${uuid}\"})", + "expr": 
"avg(uuid:ceems_gpu_prof_gr_engine_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average Graphics Engine Activity on all GPUs", @@ -1990,7 +1990,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pipe_tensor_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_pipe_tensor_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Tensor Pipe Activity on GPU: {{index}} on {{hostname}}", @@ -2003,7 +2003,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_pipe_tensor_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pipe_tensor_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average Tensor Pipe Activity on all GPUs", @@ -2108,7 +2108,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pipe_fp64_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_pipe_fp64_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "FP64 Activity on GPU: {{index}} on {{hostname}}", @@ -2121,7 +2121,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pipe_fp32_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_pipe_fp32_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "FP32 Activity on GPU: {{index}} on {{hostname}}", @@ -2134,7 +2134,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pipe_fp16_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_pipe_fp16_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "FP16 Activity on GPU: {{index}} on {{hostname}}", @@ -2147,7 +2147,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - 
"expr": "avg(uuid:ceems_gpu_pipe_fp64_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pipe_fp64_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average FP64 Engine Activity on all GPUs", @@ -2160,7 +2160,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_pipe_fp32_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pipe_fp32_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average FP32 Engine Activity on all GPUs", @@ -2173,7 +2173,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_pipe_fp16_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pipe_fp16_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average FP16 Engine Activity on all GPUs", @@ -2278,7 +2278,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_dram_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_dram_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "GPU: {{index}} on {{hostname}}", @@ -2291,7 +2291,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_dram_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_dram_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average Memory BW Utilization on all GPUs", @@ -2396,7 +2396,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_nvlink_tx_bytes:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_nvlink_tx_bytes:sum{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Transmitted on GPU: {{index}} on {{hostname}}", @@ -2409,7 +2409,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": 
"avg(uuid:ceems_gpu_nvlink_tx_bytes:ratio{uuid=\"${uuid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_nvlink_tx_bytes:sum{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average Transmitted Bandwidth on all GPUs", @@ -2422,7 +2422,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_nvlink_rx_bytes:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_nvlink_rx_bytes:sum{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Received on GPU: {{index}} on {{hostname}}", @@ -2435,7 +2435,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_nvlink_rx_bytes:ratio{uuid=\"${uuid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_nvlink_rx_bytes:sum{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average Received Bandwidth on all GPUs", @@ -2540,7 +2540,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pcie_tx_bytes:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_pcie_tx_bytes:sum{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Transmitted on GPU: {{index}} on {{hostname}}", @@ -2553,7 +2553,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_pcie_tx_bytes:ratio{uuid=\"${uuid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pcie_tx_bytes:sum{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average Transmitted Bandwidth on all GPUs", @@ -2566,7 +2566,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pcie_rx_bytes:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", + "expr": "uuid:ceems_gpu_prof_pcie_rx_bytes:sum{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"}", "hide": false, "instant": false, "legendFormat": "Received on GPU: {{index}} on {{hostname}}", @@ -2579,7 +2579,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": 
"avg(uuid:ceems_gpu_pcie_rx_bytes:ratio{uuid=\"${uuid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pcie_rx_bytes:sum{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average Received Bandwidth on all GPUs", diff --git a/thirdparty/grafana/dashboards/openstack/os-single-vm-metrics.json b/thirdparty/grafana/dashboards/openstack/os-single-vm-metrics.json index 04acc16b..f871e235 100644 --- a/thirdparty/grafana/dashboards/openstack/os-single-vm-metrics.json +++ b/thirdparty/grafana/dashboards/openstack/os-single-vm-metrics.json @@ -991,7 +991,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname) (uuid:ceems_gpu_sm_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname) (uuid:ceems_gpu_prof_sm_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "SM Activity on GPU: {{gpu}}", @@ -1004,7 +1004,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_sm_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_prof_sm_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average SM Activity on all GPUs", @@ -1017,7 +1017,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname) (uuid:ceems_gpu_sm_occupancy:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname) (uuid:ceems_gpu_prof_sm_occupancy:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "SM Occupancy GPU: {{gpu}}", @@ -1030,7 +1030,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_sm_occupancy:ratio{uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_prof_sm_occupancy:ratio{uuid=\"${uuid}\"})", 
"hide": false, "instant": false, "legendFormat": "Average SM Occupancy on all GPUs", @@ -1135,7 +1135,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname) (uuid:ceems_gpu_gr_engine_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname) (uuid:ceems_gpu_prof_gr_engine_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Graphics Engine Activity on GPU: {{gpu}}", @@ -1148,7 +1148,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_gr_engine_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_prof_gr_engine_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average Graphics Engine Activity on all GPUs", @@ -1161,7 +1161,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname) (uuid:ceems_gpu_pipe_tensor_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname) (uuid:ceems_gpu_prof_pipe_tensor_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Tensor Pipe Activity on GPU: {{gpu}}", @@ -1174,7 +1174,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_pipe_tensor_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_prof_pipe_tensor_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average Tensor Pipe Activity on all GPUs", @@ -1279,7 +1279,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname) (uuid:ceems_gpu_pipe_fp64_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname) 
(uuid:ceems_gpu_prof_pipe_fp64_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "FP64 Activity on GPU: {{gpu}}", @@ -1292,7 +1292,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname) (uuid:ceems_gpu_pipe_fp32_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname) (uuid:ceems_gpu_prof_pipe_fp32_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "FP32 Activity on GPU: {{gpu}}", @@ -1305,7 +1305,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname) (uuid:ceems_gpu_pipe_fp16_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname) (uuid:ceems_gpu_prof_pipe_fp16_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "FP16 Activity on GPU: {{gpu}}", @@ -1318,7 +1318,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_pipe_fp64_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_prof_pipe_fp64_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average FP64 Engine Activity on all GPUs", @@ -1331,7 +1331,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_pipe_fp32_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_prof_pipe_fp32_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average FP32 Engine Activity on all GPUs", @@ -1344,7 +1344,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_pipe_fp16_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname,gpuuuid) 
(uuid:ceems_gpu_prof_pipe_fp16_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average FP16 Engine Activity on all GPUs", @@ -1449,7 +1449,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname) (uuid:ceems_gpu_dram_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname) (uuid:ceems_gpu_prof_dram_active:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "GPU: {{gpu}}", @@ -1462,7 +1462,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_dram_active:ratio{uuid=\"${uuid}\"})", + "expr": "avg without (instance,hostname,gpuuuid) (uuid:ceems_gpu_prof_dram_active:ratio{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Average Memory BW Utilization on all GPUs", @@ -1567,7 +1567,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (uuid:ceems_gpu_nvlink_tx_bytes:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "sum without (instance,hostname) (uuid:ceems_gpu_prof_nvlink_tx_bytes:sum{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Transmitted on GPU: {{gpu}}", @@ -1580,7 +1580,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname,gpuuuid) (uuid:ceems_gpu_nvlink_tx_bytes:ratio{uuid=\"${uuid}\"})", + "expr": "sum without (instance,hostname,gpuuuid) (uuid:ceems_gpu_prof_nvlink_tx_bytes:sum{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Transmitted Bandwidth on all GPUs", @@ -1593,7 +1593,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (uuid:ceems_gpu_nvlink_rx_bytes:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "sum without (instance,hostname) 
(uuid:ceems_gpu_prof_nvlink_rx_bytes:sum{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Received on GPU: {{gpu}}", @@ -1606,7 +1606,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname,gpuuuid) (uuid:ceems_gpu_nvlink_rx_bytes:ratio{uuid=\"${uuid}\"})", + "expr": "sum without (instance,hostname,gpuuuid) (uuid:ceems_gpu_prof_nvlink_rx_bytes:sum{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Received Bandwidth on all GPUs", @@ -1711,7 +1711,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (uuid:ceems_gpu_pcie_tx_bytes:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "sum without (instance,hostname) (uuid:ceems_gpu_prof_pcie_tx_bytes:sum{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Transmitted on GPU: {{gpu}}", @@ -1724,7 +1724,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname,gpuuuid) (uuid:ceems_gpu_pcie_tx_bytes:ratio{uuid=\"${uuid}\"})", + "expr": "sum without (instance,hostname,gpuuuid) (uuid:ceems_gpu_prof_pcie_tx_bytes:sum{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Total Transmitted Bandwidth on all GPUs", @@ -1737,7 +1737,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname) (uuid:ceems_gpu_pcie_rx_bytes:ratio{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", + "expr": "sum without (instance,hostname) (uuid:ceems_gpu_prof_pcie_rx_bytes:sum{gpuuuid=~\"${gpu}\",uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "Received on GPU: {{gpu}}", @@ -1750,7 +1750,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum without (instance,hostname,gpuuuid) (uuid:ceems_gpu_pcie_rx_bytes:ratio{uuid=\"${uuid}\"})", + "expr": "sum without (instance,hostname,gpuuuid) 
(uuid:ceems_gpu_prof_pcie_rx_bytes:sum{uuid=\"${uuid}\"})", "hide": false, "instant": false, "legendFormat": "TOtal Received Bandwidth on all GPUs", diff --git a/thirdparty/grafana/dashboards/slurm/slurm-single-job-metrics.json b/thirdparty/grafana/dashboards/slurm/slurm-single-job-metrics.json index 9622c6e9..681e3c8f 100644 --- a/thirdparty/grafana/dashboards/slurm/slurm-single-job-metrics.json +++ b/thirdparty/grafana/dashboards/slurm/slurm-single-job-metrics.json @@ -1820,7 +1820,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_sm_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": "uuid:ceems_gpu_prof_sm_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "SM Activity on GPU: {{index}} on {{hostname}}", @@ -1833,7 +1833,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_sm_active:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_sm_active:ratio{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average SM Activity on all GPUs", @@ -1846,7 +1846,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_sm_occupancy:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": "uuid:ceems_gpu_prof_sm_occupancy:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "SM Occupancy GPU: {{index}} on {{hostname}}", @@ -1859,7 +1859,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_sm_occupancy:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_sm_occupancy:ratio{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average SM Occupancy on all GPUs", @@ -1964,7 +1964,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": 
"uuid:ceems_gpu_gr_engine_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": "uuid:ceems_gpu_prof_gr_engine_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Graphics Engine Activity on GPU: {{index}} on {{hostname}}", @@ -1977,7 +1977,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_gr_engine_active:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_gr_engine_active:ratio{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average Graphics Engine Activity on all GPUs", @@ -1990,7 +1990,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pipe_tensor_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": "uuid:ceems_gpu_prof_pipe_tensor_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Tensor Pipe Activity on GPU: {{index}} on {{hostname}}", @@ -2003,7 +2003,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_pipe_tensor_active:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pipe_tensor_active:ratio{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average Tensor Pipe Activity on all GPUs", @@ -2108,7 +2108,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pipe_fp64_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": "uuid:ceems_gpu_prof_pipe_fp64_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "FP64 Activity on GPU: {{index}} on {{hostname}}", @@ -2121,7 +2121,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pipe_fp32_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": 
"uuid:ceems_gpu_prof_pipe_fp32_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "FP32 Activity on GPU: {{index}} on {{hostname}}", @@ -2134,7 +2134,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pipe_fp16_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": "uuid:ceems_gpu_prof_pipe_fp16_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "FP16 Activity on GPU: {{index}} on {{hostname}}", @@ -2147,7 +2147,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_pipe_fp64_active:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pipe_fp64_active:ratio{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average FP64 Engine Activity on all GPUs", @@ -2160,7 +2160,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_pipe_fp32_active:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pipe_fp32_active:ratio{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average FP32 Engine Activity on all GPUs", @@ -2173,7 +2173,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_pipe_fp16_active:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pipe_fp16_active:ratio{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average FP16 Engine Activity on all GPUs", @@ -2278,7 +2278,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_dram_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": "uuid:ceems_gpu_prof_dram_active:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "GPU: {{index}} on {{hostname}}", @@ -2291,7 +2291,7 @@ "uid": "${DS_PROMETHEUS}" }, 
"editorMode": "code", - "expr": "avg(uuid:ceems_gpu_dram_active:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_dram_active:ratio{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average Memory BW Utilization on all GPUs", @@ -2396,7 +2396,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_nvlink_tx_bytes:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": "uuid:ceems_gpu_prof_nvlink_tx_bytes:sum{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Transmitted on GPU: {{index}} on {{hostname}}", @@ -2409,7 +2409,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_nvlink_tx_bytes:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_nvlink_tx_bytes:sum{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average Transmitted Bandwidth on all GPUs", @@ -2422,7 +2422,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_nvlink_rx_bytes:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": "uuid:ceems_gpu_prof_nvlink_rx_bytes:sum{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Received on GPU: {{index}} on {{hostname}}", @@ -2435,7 +2435,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_nvlink_rx_bytes:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_nvlink_rx_bytes:sum{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average Received Bandwidth on all GPUs", @@ -2540,7 +2540,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pcie_tx_bytes:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": "uuid:ceems_gpu_prof_pcie_tx_bytes:sum{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, 
"instant": false, "legendFormat": "Transmitted on GPU: {{index}} on {{hostname}}", @@ -2553,7 +2553,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_pcie_tx_bytes:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pcie_tx_bytes:sum{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average Transmitted Bandwidth on all GPUs", @@ -2566,7 +2566,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "uuid:ceems_gpu_pcie_rx_bytes:ratio{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", + "expr": "uuid:ceems_gpu_prof_pcie_rx_bytes:sum{instance=~\"${host}\",gpuuuid=~\"${gpu}\",uuid=\"${jobid}\"}", "hide": false, "instant": false, "legendFormat": "Received on GPU: {{index}} on {{hostname}}", @@ -2579,7 +2579,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "avg(uuid:ceems_gpu_pcie_rx_bytes:ratio{uuid=\"${jobid}\"})", + "expr": "avg(uuid:ceems_gpu_prof_pcie_rx_bytes:sum{uuid=\"${jobid}\"})", "hide": false, "instant": false, "legendFormat": "Average Received Bandwidth on all GPUs", diff --git a/website/docs/configuration/cacct.md b/website/docs/configuration/cacct.md index 8c4d64d4..d4e3a65a 100644 --- a/website/docs/configuration/cacct.md +++ b/website/docs/configuration/cacct.md @@ -4,15 +4,29 @@ sidebar_position: 8 # cacct -`cacct` is a CLI client that can be used instead of Grafana when operators cannot or do not wish to maintain a Grafana instance. This CLI client communicates with both the CEEMS API server and the TSDB server to fetch energy, usage, and performance metrics for a given compute unit, project, and/or user. It has been largely inspired by SLURM's [`sacct`](https://slurm.schedmd.com/sacct.html) tool, and the API resembles that of `sacct`. +`cacct` is a CLI client that can be used instead of Grafana when operators +cannot or do not wish to maintain a Grafana instance. 
This CLI client communicates +with both the CEEMS API server and the TSDB server to fetch energy, usage, and +performance metrics for a given compute unit, project, and/or user. It has been largely +inspired by SLURM's [`sacct`](https://slurm.schedmd.com/sacct.html) tool, and the API +resembles that of `sacct`. :::important[IMPORTANT] -`cacct` identifies the current username from their Linux UID. Thus, for `cacct` to work correctly, the user's UID must be the same on the machine where `cacct` is executed and in the CEEMS API server database. +`cacct` identifies the current username from their Linux UID. Thus, for `cacct` +to work correctly, the user's UID must be the same on the machine where `cacct` is +executed and in the CEEMS API server database. ::: -This tool has been specifically designed for HPC platforms where there is a common login node that users can access via SSH. The tool must be installed on such login nodes along with its configuration file. The `cacct` configuration file contains the HTTP client configuration details needed to connect to the CEEMS API and TSDB servers. Consequently, this configuration file might contain secrets for communicating with these servers, making it crucial to protect this file on a multi-tenant system like HPC login nodes. This will be discussed further in the following sections. First, let's examine the available configuration sections for `cacct`: +This tool has been specifically designed for HPC platforms where there is a common +login node that users can access via SSH. The tool must be installed on such login +nodes along with its configuration file. The `cacct` configuration file contains the +HTTP client configuration details needed to connect to the CEEMS API and TSDB servers. +Consequently, this configuration file might contain secrets for communicating with these +servers, making it crucial to protect this file on a multi-tenant system like HPC login +nodes. This will be discussed further in the following sections. 
First, let's examine +the available configuration sections for `cacct`: ```yaml # cacct configuration skeleton @@ -24,7 +38,9 @@ tsdb: :::important[IMPORTANT] -`cacct` always looks for its configuration file at `/etc/ceems/config.yml` or `/etc/ceems/config.yaml`. Therefore, the configuration file must be installed in one of these locations. +`cacct` always looks for its configuration file at `/etc/ceems/config.yml` or +`/etc/ceems/config.yaml`. Therefore, the configuration file must be installed in +one of these locations. ::: @@ -41,9 +57,17 @@ ceems_api_server: password: supersecretpassword ``` -The above configuration assumes that the target cluster has `slurm-0` as its cluster ID, as configured in the [CEEMS API server configuration](./ceems-api-server.md#clusters-configuration). By default, the CEEMS API server expects the username in the `X-Grafana-User` header, so `cacct` sets the value for this header with the username making the request. Finally, the `web` section contains the HTTP client configuration for the CEEMS API server. In this example, the CEEMS API server is reachable at host `ceems-api-server` on port `9020`, and basic authentication is configured. +The above configuration assumes that the target cluster has `slurm-0` as its cluster +ID, as configured in the [CEEMS API server configuration](./ceems-api-server.md#clusters-configuration). +By default, the CEEMS API server expects the username in the `X-Grafana-User` header, +so `cacct` sets the value for this header with the username making the request. +Finally, the `web` section contains the HTTP client configuration for the CEEMS API +server. In this example, the CEEMS API server is reachable at host `ceems-api-server` +on port `9020`, and basic authentication is configured. -`cacct` can pull time series data from the TSDB server for the requested compute units. This is possible only when the `tsdb` section is configured. 
A sample configuration file including both CEEMS API server and TSDB server configurations is shown below: +`cacct` can pull time series data from the TSDB server for the requested compute units. +This is possible only when the `tsdb` section is configured. A sample configuration file +including both CEEMS API server and TSDB server configurations is shown below: ```yaml ceems_api_server: @@ -93,25 +117,45 @@ tsdb: io_write_bytes: irate(ceems_ebpf_write_bytes_total{uuid=~"%s"}[1m]) ``` -Similar to the CEEMS API server configuration, this example assumes the TSDB server is reachable at `tsdb:9090` and basic authentication is configured on the HTTP server. The `tsdb.queries` section is where operators configure the queries to pull time series data for each metric. If operators used [`ceems_tool`](../usage/ceems-tool.md) to generate recording rules for the TSDB, the queries in the sample configuration above will work out-of-the-box. The keys in the `queries` object can be chosen freely; they are provided for configuration file maintainability. The placeholder `%s` will be replaced by the compute unit UUIDs at runtime before executing the queries on the TSDB server. +Similar to the CEEMS API server configuration, this example assumes the TSDB server is +reachable at `tsdb:9090` and basic authentication is configured on the HTTP server. The +`tsdb.queries` section is where operators configure the queries to pull time series data +for each metric. If operators used [`ceems_tool`](../usage/ceems-tool.md) to generate +recording rules for the TSDB, the queries in the sample configuration above will work +out-of-the-box. The keys in the `queries` object can be chosen freely; they are provided +for configuration file maintainability. The placeholder `%s` will be replaced by the compute +unit UUIDs at runtime before executing the queries on the TSDB server. 
:::note[NOTE] -There is no risk of injection here, as the UUID values provided by the end-user are first sanitized and then verified with the CEEMS API server to check if the user is the owner of the compute unit before passing them to the TSDB server. +There is no risk of injection here, as the UUID values provided by the end-user are first +sanitized and then verified with the CEEMS API server to check if the user is the owner +of the compute unit before passing them to the TSDB server. ::: -A complete reference can be found in the [Reference](./config-reference.md) section. A valid sample configuration file can be found in the [repository](https://github.com/@ceemsOrg@/@ceemsRepo@/blob/main/build/config/cacct/cacct.yml). +A complete reference can be found in the [Reference](./config-reference.md) section. A valid +sample configuration file can be found in the +[repository](https://github.com/@ceemsOrg@/@ceemsRepo@/blob/main/build/config/cacct/cacct.yml). ## Securing configuration file -As evident from the previous section, the `cacct` configuration file contains secrets that should not be accessible to end-users. At the same time, the `cacct` executable must be accessible to end-users so they can fetch their usage statistics. This means `cacct` must be able to read the configuration file at runtime, but the user executing it should not. This can be achieved using the [Sticky bit](https://www.redhat.com/en/blog/suid-sgid-sticky-bit). +As evident from the previous section, the `cacct` configuration file contains secrets that +should not be accessible to end-users. At the same time, the `cacct` executable must be +accessible to end-users so they can fetch their usage statistics. This means `cacct` must +be able to read the configuration file at runtime, but the user executing it should not. +This can be achieved using the [Sticky bit](https://www.redhat.com/en/blog/suid-sgid-sticky-bit). 
-By using the SETUID or SETGID bit on the executable, the binary executes as the user or group that owns the file, not the user who invokes the execution. For instance, imagine a case where a system user/group `ceems` is created on an HPC login node. The SETGID sticky bit can be set on `cacct` as follows: +By using the SETUID or SETGID bit on the executable, the binary will have privileges of the user or +group that owns the file. Thus, a SETUID binary owned by `ceems` can read a config file owned by `ceems`. +Once the config file has been read, `cacct` will drop privileges and execute the rest of the code as the +user who invoked it. This way, the privileges are only kept for the minimal time needed to read the config file +and are dropped after fetching the config. The SETUID bit +can be set on `cacct` as follows: ```bash chown ceems:ceems /usr/local/bin/cacct -chmod g+s /usr/local/bin/cacct +chmod u+s /usr/local/bin/cacct # Ensure others can execute cacct chmod o+x /usr/local/bin/cacct @@ -121,6 +165,9 @@ chown ceems:ceems /etc/ceems/config.yml chmod o-rwx /etc/ceems/config.yml ``` -Now, every time `cacct` is invoked, it runs as the `ceems` user/group instead of the user who invoked it. Since the same user/group owns `/etc/ceems/config.yml`, `cacct` can read the file. Simultaneously, the user who invoked the `cacct` binary cannot access `/etc/ceems/config.yml` because their permissions have been revoked. +Now, every time `cacct` is invoked, it will have the privileges of the `ceems` user/group to read +`/etc/ceems/config.yml` and will then drop privileges to the user who invoked the program. -When `cacct` is installed using the RPM/DEB file provided by the [CEEMS Releases](https://github.com/@ceemsOrg@/@ceemsRepo@/releases), `cacct` is already installed with the sticky bit set. Operators only need to populate the configuration file at `/etc/ceems/config.yml`. 
+When `cacct` is installed using the RPM/DEB file provided by the +[CEEMS Releases](https://github.com/@ceemsOrg@/@ceemsRepo@/releases), `cacct` is already installed with +the sticky bit set. Operators only need to populate the configuration file at `/etc/ceems/config.yml`. diff --git a/website/docs/configuration/ceems-exporter.md b/website/docs/configuration/ceems-exporter.md index 97789f08..a28fcc27 100644 --- a/website/docs/configuration/ceems-exporter.md +++ b/website/docs/configuration/ceems-exporter.md @@ -675,6 +675,13 @@ an error. In that case, do not enable the collector. ### RAPL collector +:::important[IMPORTANT] + +Starting from `v0.11.0`, the RAPL collector is disabled by default and must +be explicitly enabled using the `--collector.rapl` CLI flag. + +::: + For kernels that are `<5.3`, there is no special configuration to be done. If the kernel version is `>=5.3`, RAPL metrics are only available for `root`. Three approaches can be envisioned here: diff --git a/website/docs/configuration/prometheus.md b/website/docs/configuration/prometheus.md index 80c92849..1a0a1ca9 100644 --- a/website/docs/configuration/prometheus.md +++ b/website/docs/configuration/prometheus.md @@ -4,6 +4,14 @@ sidebar_position: 6 # Prometheus +:::important[IMPORTANT] + +From version `v0.11.0`, this configuration is no longer necessary and if recording +rules are generated using [`ceems_tool`](../usage/ceems-tool.md), the relabeling will be +handled directly in the rules. 
+ +::: + In order to use the dashboards provided in the repository, minor [`metric_relabel_configs`](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) configuration must be provided for all target groups that have NVIDIA GPUs where diff --git a/website/docs/deployments/guide.md b/website/docs/deployments/guide.md index 344aacd0..571f3f70 100644 --- a/website/docs/deployments/guide.md +++ b/website/docs/deployments/guide.md @@ -400,27 +400,6 @@ scrape_configs: basic_auth: username: ceems password: - # This relabel_config must be added to all - # scrape jobs that have DCGM targets - metric_relabel_configs: - - source_labels: - - modelName - - UUID - target_label: gpuuuid - regex: NVIDIA(.*);(.*) - replacement: $2 - action: replace - - source_labels: - - modelName - - GPU_I_ID - target_label: gpuiid - regex: NVIDIA(.*);(.*) - replacement: $2 - action: replace - - regex: UUID - action: labeldrop - - regex: GPU_I_ID - action: labeldrop static_configs: - targets: - compute-gpu-0:9010 @@ -440,7 +419,7 @@ scrape_configs: - service-0:9010 ``` -:::important[IMPORTANT] + This is only basic configuration and more options can be found in the [Prometheus](https://prometheus.io/docs/prometheus/latest/configuration/configuration) @@ -493,9 +472,11 @@ Recording rules can be created using `ceems_tool` using the following command: :::important[IMPORTANT] -When [Redfish Collector](../components/ceems-exporter.md#energy-related-collectors) is enabled -on CEEMS exporters and if Redfish server has multiple chassis defined, the above command will ask -for the user input on which chassis must be used in estimated power consumption. 
As different chassis +When [Redfish Collector](../components/ceems-exporter.md#energy-related-collectors) +or [HWMon Collector](../components/ceems-exporter.md#hwmon-collector) is enabled +on CEEMS exporters and if the Redfish server has multiple chassis or hwmon has multiple chips defined, +the above command will ask for user input on which chassis/chip must be used in estimating +power consumption. As different chassis/chip can report power consumption of different components, operators must choose a chassis that reports power consumption of host. diff --git a/website/docs/usage/ceems-tool.md b/website/docs/usage/ceems-tool.md index 6fb7cfdb..a832c9ae 100644 --- a/website/docs/usage/ceems-tool.md +++ b/website/docs/usage/ceems-tool.md @@ -72,7 +72,10 @@ the `--emission-factor` CLI flag. However, it is not possible to set both `--cou If the [Redfish Collector](../configuration/ceems-exporter.md#redfish-collector) is being used and it has multiple chassis, the above will ask for user input on which chassis must be included in the -recording rules. Operators must choose the chassis that reports the host power usage. +recording rules. Operators must choose the chassis that reports the host power usage. Similarly, if +[HWMon Collector](../configuration/ceems-exporter.md#hwmon-collector) is enabled and there are power +metrics available for multiple components, the operator must choose the component that reports the +host power usage. :::