Skip to content

Updates and fixes to recording rules subcommand of ceems_tool #397

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/step_images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ jobs:

- name: Push README to registry
uses: christian-korneck/update-container-description-action@d36005551adeaba9698d8d67a296bd16fa91f8e8 # v1
if: (github.ref == 'refs/heads/main' || (github.event_name == 'push' && contains(github.ref, 'refs/tags/'))) && github.repository_owner == 'ceems' # Don't run this workflow on forks.
if: (github.ref == 'refs/heads/main' || (github.event_name == 'push' && contains(github.ref, 'refs/tags/'))) && github.repository_owner == 'ceems-dev' # Don't run this workflow on forks.
env:
# For dockerhub registry
DOCKER_USER: ${{ secrets.login }}
Expand Down
17 changes: 16 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# Changelog

## 0.11.0 / 2025-*-*
## 0.11.0 / 2025-*

### Breaking Changes

#### CEEMS Exporter

- Collector `rapl` is now disabled by default. To enable it, add `--collector.rapl` to the CLI arguments.
- Collector `ipmi_dcmi` has been renamed to `ipmi` as more functionality beyond DCMI has been added to the collector.
- Following metric labels have been renamed to be more consistent with Prometheus naming convention:
* `ceems_ipmi_dcmi_current_watts` -> `ceems_ipmi_dcmi_power_current_watts`
Expand All @@ -17,6 +18,20 @@
* `ceems_redfish_max_watts` -> `ceems_redfish_power_max_watts`
* `ceems_redfish_avg_watts` -> `ceems_redfish_power_avg_watts`

#### CEEMS tool

- The relabel configs generated by subcommand `create-relabel-configs` are obsolete as the relabelling of metrics is now handled directly inside the recording rules. Please
regenerate the recording rules with the new version and remove the existing relabel configs on the Prometheus server.
- Several minor bugs in recording rules have been fixed. Please regenerate the recording rules with the new version of `ceems_tool`.
- GPU profiling metrics have been renamed to have `prof` in the metric label. For instance, `uuid:ceems_gpu_sm_active:ratio` became
`uuid:ceems_gpu_prof_sm_active:ratio`.
- The suffix of NVIDIA profiling metrics has been corrected to use `sum` instead of `ratio` for NVLink and PCIe traffic metrics. Thus, these metrics
have been renamed as follows:
* `uuid:ceems_gpu_pcie_tx_bytes:ratio` -> `uuid:ceems_gpu_prof_pcie_tx_bytes:sum`
* `uuid:ceems_gpu_pcie_rx_bytes:ratio` -> `uuid:ceems_gpu_prof_pcie_rx_bytes:sum`
* `uuid:ceems_gpu_nvlink_tx_bytes:ratio` -> `uuid:ceems_gpu_prof_nvlink_tx_bytes:sum`
* `uuid:ceems_gpu_nvlink_rx_bytes:ratio` -> `uuid:ceems_gpu_prof_nvlink_rx_bytes:sum`

## 0.10.2 / 2025-08-07

- [BUGFIX] Fix bpf code to work with LLVM 20 [#393](https://github.com/mahendrapaipuri/ceems/pull/393) ([@mahendrapaipuri](https://github.com/mahendrapaipuri))
Expand Down
4 changes: 2 additions & 2 deletions build/package/cacct/postinstall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
cleanInstall() {
printf "\033[32m Post Install of an clean install\033[0m\n"
# Step 3 (clean install), set the setuid bit on cacct
chmod g+s /usr/local/bin/cacct
chmod u+s /usr/local/bin/cacct
}

upgrade() {
printf "\033[32m Post Install of an upgrade\033[0m\n"
# Step 3 (upgrade), set the setuid bit on cacct
chmod g+s /usr/local/bin/cacct
chmod u+s /usr/local/bin/cacct
}

# Step 2, check if this is a clean install or an upgrade
Expand Down
20 changes: 8 additions & 12 deletions cmd/cacct/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ const (

// stats returns units and usage structs by making requests to CEEMS API server.
func stats(
config *Config,
currentUser string,
start time.Time,
end time.Time,
Expand All @@ -35,15 +36,6 @@ func stats(
tsData bool,
tsDataOut string,
) ([]models.Unit, []models.Usage, error) {
// By this time, user input is validated. Time to read config file
// to get HTTP config to connect to CEEMS API server.
// Either setuid or setgid bits must be applied on the app so that
// the config file can be read as the owner of this app
config, err := readConfig()
if err != nil {
return nil, nil, fmt.Errorf("failed to read config file: %w", err)
}

// Add user header to HTTP config
userHeaders := http_config.Header{
Values: []string{currentUser},
Expand Down Expand Up @@ -143,7 +135,8 @@ func stats(
return units, usage, nil
}

if err := tsdbData(ctx, config, units, tsDataOut); err != nil {
err := tsdbData(ctx, config, units, tsDataOut)
if err != nil {
fmt.Fprintln(os.Stderr, "failed to fetch time series data", err)
}
}
Expand All @@ -163,7 +156,8 @@ func makeRequest[T any](ctx context.Context, reqURL string, urlValues url.Values
req.URL.RawQuery = urlValues.Encode()

// Make request
if resp, err := client.Do(req); err != nil {
resp, err := client.Do(req)
if err != nil {
return nil, err
} else {
defer resp.Body.Close()
Expand All @@ -181,7 +175,9 @@ func makeRequest[T any](ctx context.Context, reqURL string, urlValues url.Values

// Unpack into data
var data Response[T]
if err = json.Unmarshal(body, &data); err != nil {

err = json.Unmarshal(body, &data)
if err != nil {
return nil, err
}

Expand Down
111 changes: 82 additions & 29 deletions cmd/cacct/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ import (
"os/user"
"path/filepath"
"slices"
"strconv"
"strings"
"syscall"
"time"

"github.com/alecthomas/kingpin/v2"
Expand Down Expand Up @@ -301,7 +303,8 @@ func (c *Config) UnmarshalYAML(unmarshal func(any) error) error {

type plain Config

if err := unmarshal((*plain)(c)); err != nil {
err := unmarshal((*plain)(c))
if err != nil {
return err
}

Expand All @@ -322,14 +325,16 @@ func (w *WebConfig) UnmarshalYAML(unmarshal func(any) error) error {

type plain WebConfig

if err := unmarshal((*plain)(w)); err != nil {
err := unmarshal((*plain)(w))
if err != nil {
return err
}

// The UnmarshalYAML method of HTTPClientConfig is not being called because it's not a pointer.
// We cannot make it a pointer as the parser panics for inlined pointer structs.
// Thus we just do its validation here.
if err := w.HTTPClientConfig.Validate(); err != nil {
err = w.HTTPClientConfig.Validate()
if err != nil {
return err
}

Expand Down Expand Up @@ -398,7 +403,8 @@ func main() {
"markdown", "Produce markdown output (default: false).",
).Default("false").BoolVar(&mdOut)

if _, err := cacctApp.Parse(os.Args[1:]); err != nil {
_, err := cacctApp.Parse(os.Args[1:])
if err != nil {
kingpin.Fatalf("failed to parse CLI flags: %v", err)
}

Expand Down Expand Up @@ -444,25 +450,26 @@ func main() {
// Always add started and ended ts fields as we will need them for TSDB data retrieval
fields = append(fields, []string{"started_at_ts", "ended_at_ts"}...)

// Ensure --job flag is passed when asking for metric data
// This is to avoid fetching metrics of too many jobs when only
// period is set
if tsData && len(jobs) == 0 {
kingpin.Fatalf("explicit job IDs must be passed using --job when --ts is enabled")
}

// Convert start and end times to time.Time
var start, end time.Time

var err error
if start, err = parseTime(startTime); err != nil {
start, err = parseTime(startTime)
if err != nil {
kingpin.Fatalf("failed to parse --starttime flag: %v", err)
}

if end, err = parseTime(endTime); err != nil {
end, err = parseTime(endTime)
if err != nil {
kingpin.Fatalf("failed to parse --endtime flag: %v", err)
}

// Limit the period to 1 week when asking for metric data.
// This is to avoid fetching metrics of too many jobs when only
// a period is set.
if tsData && end.Sub(start) > 7*24*time.Hour {
kingpin.Fatalf("limit period between --starttime and --endtime to 7 days when --ts is enabled")
}

// Get current user and add user's config dir to slice of config
// dirs.
// If current user is root and mockCurrentUser and/or mockConfigPath
Expand All @@ -474,18 +481,56 @@ func main() {
}

// Check if currentUser is only user in userNames and if so, set userNames to nil
if len(userNames) == 1 && userNames[0] == currentUser {
if len(userNames) == 1 && userNames[0] == currentUser.Username {
userNames = nil
}

// By this time, user input is validated. Time to read config file
// to get HTTP config to connect to CEEMS API server.
// Either setuid or setgid bits must be applied on the app so that
// the config file can be read as the owner of this app
config, err := readConfig()
if err != nil {
os.Exit(checkErr(fmt.Errorf("failed to read config file: %w", err)))
}

// Now drop privileges so that the rest of the app runs as the regular user
// who invoked it. This is necessary to be able to create directories and files
// in the user's space.
// The condition ensures that this is executed only in production and not in e2e
// test cases.
if mockCurrentUser == "" && mockConfigPath == "" {
// Convert UID and GID to int
uid, err := strconv.Atoi(currentUser.Uid)
if err != nil {
os.Exit(checkErr(fmt.Errorf("failed to get current user uid: %w", err)))
}

gid, err := strconv.Atoi(currentUser.Gid)
if err != nil {
os.Exit(checkErr(fmt.Errorf("failed to get current user gid: %w", err)))
}

// Set UID and GID to current user
err = syscall.Setuid(uid)
if err != nil {
os.Exit(checkErr(fmt.Errorf("failed to set current user uid: %w", err)))
}

err = syscall.Setgid(gid)
if err != nil {
os.Exit(checkErr(fmt.Errorf("failed to set current user gid: %w", err)))
}
}

// Get stats
units, usages, err := stats(currentUser, start, end, accounts, jobs, userNames, fields, tsData, tsDataOut)
units, usages, err := stats(config, currentUser.Username, start, end, accounts, jobs, userNames, fields, tsData, tsDataOut)
if err != nil {
os.Exit(checkErr(err))
}

// Print stats as table
t := newTable(currentUser, userNames, units, usages)
t := newTable(currentUser.Username, userNames, units, usages)

// Based on request rendering format
switch {
Expand Down Expand Up @@ -642,14 +687,17 @@ func readConfig() (*Config, error) {
for _, configPath := range configPaths {
for _, file := range []string{"config.yml", "config.yaml", "cacct.yml", "cacct.yaml"} {
configFile := filepath.Join(configPath, file)
if _, err := os.Stat(configFile); err == nil {

_, err := os.Stat(configFile)
if err == nil {
// Read config file
cfg, err := os.ReadFile(configFile)
if err != nil {
return nil, err
}

if err = yaml.Unmarshal(cfg, &config); err != nil {
err = yaml.Unmarshal(cfg, &config)
if err != nil {
return nil, err
}

Expand All @@ -663,32 +711,34 @@ func readConfig() (*Config, error) {

// getCurrentUser returns the actual user executing the cacct. If --current-user
// CLI flag is passed, that user will be returned as current user.
func getCurrentUser(mockUserName string, mockConfigPath string) (string, error) {
func getCurrentUser(mockUserName string, mockConfigPath string) (*user.User, error) {
// Get the current user, i.e. the user who is executing cacct
var currentUser string
var currentUser *user.User

if u, err := user.Current(); err != nil {
return "", fmt.Errorf("failed to get current user: %w", err)
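// Look up the user who invoked cacct. NOTE(review): cacct is a setuid binary and
// user.Current() appears to resolve the real (invoking) UID rather than the effective UID — confirm.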
u, err := user.Current()
if err != nil {
return nil, fmt.Errorf("failed to get current user: %w", err)
} else {
// Check if mockUserName is set. This will be always empty string
// for production builds as we do not compile flags for production
// builds
if mockUserName != "" {
currentUser = mockUserName
currentUser = &user.User{Username: mockUserName}

// If mockConfigPath is set as well, add to configPaths
if mockConfigPath != "" {
configPaths = append(configPaths, mockConfigPath)
}
} else {
currentUser = u.Name
currentUser = u
}
}

// Add user HOME to configPaths
userConfigDir, err := os.UserConfigDir()
if err != nil {
return "", fmt.Errorf("failed to get config file: %w", err)
return nil, fmt.Errorf("failed to get config file: %w", err)
}

configPaths = append(configPaths, filepath.Join(userConfigDir, "ceems"))
Expand All @@ -698,17 +748,20 @@ func getCurrentUser(mockUserName string, mockConfigPath string) (string, error)

func parseTime(s string) (time.Time, error) {
// First attempt is to parse as YYYY-MM-DDTHH:MM:SS
if t, err := time.Parse("2006-01-02T15:04:05", s); err == nil {
t, err := time.Parse("2006-01-02T15:04:05", s)
if err == nil {
return t.In(time.Local), nil
}

// Second attempt is to parse as YYYY-MM-DDTHH:MM
if t, err := time.Parse("2006-01-02T15:04", s); err == nil {
t, err = time.Parse("2006-01-02T15:04", s)
if err == nil {
return t.In(time.Local), nil
}

// Third attempt is to parse as YYYY-MM-DD
if t, err := time.Parse("2006-01-02", s); err == nil {
t, err = time.Parse("2006-01-02", s)
if err == nil {
return t.In(time.Local), nil
}

Expand Down
2 changes: 1 addition & 1 deletion cmd/cacct/testdata/output/e2e-test-cacct-tsdata-fail.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
cacct: error: explicit job IDs must be passed using --job when --ts is enabled
cacct: error: limit period between --starttime and --endtime to 7 days when --ts is enabled
1 change: 1 addition & 0 deletions cmd/cacct/testdata/output/e2e-test-cacct-tsdata.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
{
"fingerprint": "554b56cadf9dea4b",
"labels": {
"__name__": "cpu_usage",
"instance": "localhost:9090",
"uuid": "147973"
}
Expand Down
Loading