NVIDIA · cluster2600 · Feb 18, 2026
diff --git a/pkg/dcgm/diag.go b/pkg/dcgm/diag.go
@@ -11,8 +11,6 @@ import (
 	"unsafe"
 )
 
-// Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM)
-
 // DIAG_RESULT_STRING_SIZE represents the maximum size of diagnostic result strings
 const DIAG_RESULT_STRING_SIZE = 1024
 

diff --git a/pkg/dcgm/doc.go b/pkg/dcgm/doc.go
@@ -0,0 +1,186 @@
+/*
+Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package dcgm provides Go bindings for the NVIDIA Data Center GPU Manager
+// (DCGM) C library.
+//
+// # Overview
+//
+// DCGM is a suite of tools for managing and monitoring NVIDIA data-center GPUs
+// in cluster environments. This package wraps the DCGM C API and exposes it as
+// idiomatic Go, including health checks, field watches, diagnostics, GPU group
+// management, policy violation monitoring, and topology queries.
+//
+// # Initialization
+//
+// Every program must call [Init] before using any other function, and should
+// call the returned cleanup function (or [Shutdown]) before exiting.  Three
+// operating modes are supported:
+//
+//   - [Embedded] – start the DCGM host engine inside the current process.
+//     Suitable for standalone tools and tests.
+//   - [Standalone] – connect to an already-running nv-hostengine daemon.
+//     Pass the daemon's address as an additional argument.
+//   - [StartHostengine] – spawn nv-hostengine as a child process, connect to
+//     it, and automatically terminate it on shutdown.
+//
+// Example – embedded mode:
+//
+//	cleanup, err := dcgm.Init(dcgm.Embedded)
+//	if err != nil {
+//	    log.Fatal(err)
+//	}
+//	defer cleanup()
+//
+// Example – standalone mode (connect to a running nv-hostengine):
+//
+//	cleanup, err := dcgm.Init(dcgm.Standalone, "localhost:5555")
+//	if err != nil {
+//	    log.Fatal(err)
+//	}
+//	defer cleanup()
+//
+// # Field Watches
+//
+// Monitoring GPU metrics is a three-step process:
+//
+//  1. Create a field group with the metric IDs you want to watch.
+//  2. Create a GPU group (or use [GroupAllGPUs]) and start the watch.
+//  3. Read values with [GetValuesSince] and clean up when done.
+//
+// Example:
+//
+//	cleanup, _ := dcgm.Init(dcgm.Embedded)
+//	defer cleanup()
+//
+//	fields := []dcgm.Short{
+//	    dcgm.DCGM_FI_DEV_GPU_TEMP,
+//	    dcgm.DCGM_FI_DEV_POWER_USAGE,
+//	}
+//
+//	fieldGroup, err := dcgm.FieldGroupCreate("myFields", fields)
+//	if err != nil {
+//	    log.Fatal(err)
+//	}
+//	defer dcgm.FieldGroupDestroy(fieldGroup)
+//
+//	gpuGroup := dcgm.GroupAllGPUs()
+//	if err := dcgm.WatchFieldsWithGroup(fieldGroup, gpuGroup); err != nil {
+//	    log.Fatal(err)
+//	}
+//	defer dcgm.UnwatchFields(fieldGroup, gpuGroup)
+//
+//	values, _, err := dcgm.GetValuesSince(gpuGroup, fieldGroup, time.Time{})
+//	if err != nil {
+//	    log.Fatal(err)
+//	}
+//	for _, v := range values {
+//	    fmt.Printf("GPU %d field %d: %v\n", v.EntityID, v.FieldID, v.Int64())
+//	}
+//
+// # GPU Groups
+//
+// GPU groups let you apply operations to a named set of GPUs.  Use
+// [GroupAllGPUs] to target every GPU on the system, or [CreateGroup] to build
+// a custom group.  Groups must be destroyed with [DestroyGroup] when no longer
+// needed.
+//
+//	group, err := dcgm.CreateGroup("workers")
+//	if err != nil {
+//	    log.Fatal(err)
+//	}
+//	defer dcgm.DestroyGroup(group)
+//
+//	// Add GPU 0 and GPU 1 to the group.
+//	_ = dcgm.AddToGroup(group, 0)
+//	_ = dcgm.AddToGroup(group, 1)
+//
+// # Health Checks
+//
+// Passive health monitoring tracks PCIe errors, NVLink faults, memory
+// failures, and more.  Enable the watches for a group, then call [HealthCheck]
+// to retrieve the current status.
+//
+//	dcgm.HealthSet(group, dcgm.DCGM_HEALTH_WATCH_ALL)
+//	response, err := dcgm.HealthCheck(group)
+//	if err != nil {
+//	    log.Fatal(err)
+//	}
+//	for _, incident := range response.Incidents {
+//	    fmt.Printf("GPU %d: %s – %s\n",
+//	        incident.EntityInfo.EntityId,
+//	        incident.System,
+//	        incident.Health)
+//	}
+//
+// # Policy Violation Monitoring
+//
+// Register callbacks for GPU policy violations (ECC errors, XID events, power
+// limits, etc.) using [ListenForPolicyViolations].  The function returns a
+// channel that receives [PolicyViolation] values.  Always cancel the context
+// when monitoring is no longer required to avoid goroutine leaks.
+//
+//	ctx, cancel := context.WithCancel(context.Background())
+//	defer cancel()
+//
+//	violations, err := dcgm.ListenForPolicyViolations(ctx,
+//	    dcgm.POLICY_COND_DBE,
+//	    dcgm.POLICY_COND_NVLINK,
+//	)
+//	if err != nil {
+//	    log.Fatal(err)
+//	}
+//
+//	for v := range violations {
+//	    fmt.Printf("Policy violation on GPU %d: condition %v\n",
+//	        v.Condition, v.Val)
+//	}
+//
+// # Diagnostics
+//
+// Run the built-in GPU diagnostic suite with [RunDiag].  The diagnostics are
+// graduated by level ([DiagQuick], [DiagShort], [DiagMedium], [DiagLong]).
+//
+//	results, err := dcgm.RunDiag(dcgm.DiagShort, dcgm.GroupAllGPUs())
+//	if err != nil {
+//	    log.Fatal(err)
+//	}
+//	for _, r := range results.Results {
+//	    fmt.Printf("GPU %d test %s: %s\n", r.GPU, r.TestName, r.Result)
+//	}
+//
+// # Thread Safety
+//
+// [Init] and [Shutdown] are protected by an internal mutex and are safe to
+// call concurrently.  All other functions assume that [Init] has completed
+// successfully.  The DCGM C library itself is thread-safe for read operations;
+// consult the DCGM documentation for write-operation constraints.
+//
+// # Resource Management
+//
+// Many objects allocated by DCGM (groups, field groups, etc.) must be
+// explicitly released.  The idiomatic pattern is to pair each Create/Watch
+// call with a deferred Destroy/Unwatch call:
+//
+//	group, _ := dcgm.CreateGroup("g")
+//	defer dcgm.DestroyGroup(group)
+//
+//	fg, _ := dcgm.FieldGroupCreate("fg", fields)
+//	defer dcgm.FieldGroupDestroy(fg)
+//
+// Failing to release resources will cause memory leaks inside nv-hostengine
+// for the duration of the process.
+package dcgm
diff --git a/pkg/dcgm/fields.go b/pkg/dcgm/fields.go
@@ -143,13 +143,20 @@ func WatchFields(gpuID uint, fieldsGroup FieldHandle, groupName string) (groupId
 	return group, nil
 }
 
-// WatchFieldsWithGroupEx starts monitoring fields with custom parameters.
-// fieldsGroup is the handle of the field group to watch.
-// group is the group handle to associate with the watch.
-// updateFreq is the update frequency in microseconds.
-// maxKeepAge is the maximum age of samples to keep in seconds.
-// maxKeepSamples is the maximum number of samples to keep.
-// Returns an error if the watch operation fails.
+// WatchFieldsWithGroupEx starts monitoring the fields in fieldsGroup for all
+// entities in group with caller-supplied sampling parameters.
+//
+//   - updateFreq: how often DCGM samples the fields, in microseconds.
+//   - maxKeepAge: how long (in seconds) DCGM retains historical samples; 0
+//     means keep samples until maxKeepSamples is reached.
+//   - maxKeepSamples: maximum number of samples to retain per field per entity;
+//     0 means unlimited (bounded only by maxKeepAge).
+//
+// After the watch is established, DCGM immediately triggers a field update so
+// values are available without waiting for the first sampling interval.
+//
+// Use [WatchFieldsWithGroup] when the default parameters are acceptable.
+// Call [UnwatchFields] to stop monitoring and release the associated resources.
 func WatchFieldsWithGroupEx(
 	fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32,
 ) error {
@@ -167,10 +174,29 @@ func WatchFieldsWithGroupEx(
 	return nil
 }
 
-// WatchFieldsWithGroup starts monitoring fields using default parameters.
-// fieldsGroup is the handle of the field group to watch.
-// group is the group handle to associate with the watch.
-// Returns an error if the watch operation fails.
+// WatchFieldsWithGroup starts monitoring the fields in fieldsGroup for all
+// entities in group using the default sampling parameters:
+//
+//   - updateFreq: 30 seconds (30,000,000 µs)
+//   - maxKeepAge: 0 (unlimited; bounded by maxKeepSamples)
+//   - maxKeepSamples: 1 (only the most recent sample is kept)
+//
+// For custom sampling parameters use [WatchFieldsWithGroupEx].
+// Call [UnwatchFields] to stop monitoring and release resources.
+//
+// Example:
+//
+//	fieldGroup, _ := dcgm.FieldGroupCreate("temps", []dcgm.Short{dcgm.DCGM_FI_DEV_GPU_TEMP})
+//	defer dcgm.FieldGroupDestroy(fieldGroup)
+//
+//	gpuGroup := dcgm.GroupAllGPUs()
+//	if err := dcgm.WatchFieldsWithGroup(fieldGroup, gpuGroup); err != nil {
+//	    return err
+//	}
+//	defer dcgm.UnwatchFields(fieldGroup, gpuGroup)
+//
+//	values, _, err := dcgm.GetValuesSince(gpuGroup, fieldGroup, time.Time{})
+//	// process values...
 func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error {
 	return WatchFieldsWithGroupEx(fieldsGroup, group, defaultUpdateFreq, defaultMaxKeepAge, defaultMaxKeepSamples)
 }

diff --git a/pkg/dcgm/hostengine_status.go b/pkg/dcgm/hostengine_status.go
@@ -1,4 +1,3 @@
-// Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM)
 package dcgm
 
 /*