From dbae7dafbcaabedba119a6b49df479ade569b423 Mon Sep 17 00:00:00 2001 From: Jan-Hendrik Boll Date: Fri, 20 Feb 2026 14:50:48 +0000 Subject: [PATCH 1/4] Add kubeevents and systemd logs to must gather output --- tooling/hcpctl/cmd/must-gather/query_cmd.go | 3 +- .../hcpctl/cmd/must-gather/query_options.go | 12 +++-- .../hcpctl/pkg/kusto/configurable_query.go | 11 ++++ tooling/hcpctl/pkg/mustgather/gather.go | 36 +++++++++++-- tooling/hcpctl/pkg/mustgather/queries.go | 53 ++++++++++++++++++- tooling/hcpctl/pkg/mustgather/queries_test.go | 6 +-- 6 files changed, 109 insertions(+), 12 deletions(-) diff --git a/tooling/hcpctl/cmd/must-gather/query_cmd.go b/tooling/hcpctl/cmd/must-gather/query_cmd.go index 09e1e75793..f42ed75b20 100644 --- a/tooling/hcpctl/cmd/must-gather/query_cmd.go +++ b/tooling/hcpctl/cmd/must-gather/query_cmd.go @@ -58,7 +58,7 @@ func (opts *MustGatherOptions) Run(ctx context.Context) error { } }() - queryOptions, err := mustgather.NewQueryOptions(opts.SubscriptionID, opts.ResourceGroup, opts.ResourceId, opts.TimestampMin, opts.TimestampMax, opts.Limit) + queryOptions, err := mustgather.NewQueryOptions(opts.SubscriptionID, opts.ResourceGroup, opts.ResourceId, opts.InfraClusterName, opts.TimestampMin, opts.TimestampMax, opts.Limit) if err != nil { return fmt.Errorf("failed to create query options: %w", err) } @@ -66,6 +66,7 @@ func (opts *MustGatherOptions) Run(ctx context.Context) error { gatherer := mustgather.NewCliGatherer(opts.QueryClient, opts.OutputPath, ServicesLogDirectory, HostedControlPlaneLogDirectory, mustgather.GathererOptions{ QueryOptions: queryOptions, SkipHostedControlPlaneLogs: opts.SkipHostedControlPlaneLogs, + GatherInfraLogs: opts.GatherInfraLogs, }) err = gatherer.GatherLogs(ctx) diff --git a/tooling/hcpctl/cmd/must-gather/query_options.go b/tooling/hcpctl/cmd/must-gather/query_options.go index 3735fa11ca..9781d5069c 100644 --- a/tooling/hcpctl/cmd/must-gather/query_options.go +++ 
b/tooling/hcpctl/cmd/must-gather/query_options.go @@ -42,6 +42,8 @@ type RawMustGatherOptions struct { TimestampMin time.Time // Timestamp minimum TimestampMax time.Time // Timestamp maximum Limit int // Limit the number of results + GatherInfraLogs bool // Gather all logs from the infrastructure, does NOT gather HCP logs + InfraClusterName string // Name of the infrastructure cluster } // DefaultMustGatherOptions returns a new RawMustGatherOptions struct initialized with sensible defaults. @@ -80,6 +82,8 @@ func BindMustGatherOptions(opts *RawMustGatherOptions, cmd *cobra.Command) error cmd.Flags().StringVar(&opts.Region, "region", opts.Region, "Azure Data Explorer cluster region (required)") cmd.Flags().DurationVar(&opts.QueryTimeout, "query-timeout", opts.QueryTimeout, "timeout for query execution") cmd.Flags().StringVar(&opts.OutputPath, "output-path", opts.OutputPath, "path to write the output file") + cmd.Flags().BoolVar(&opts.GatherInfraLogs, "gather-infra-logs", false, "gather all logs from the infrastructure, does NOT gather HCP logs") + cmd.Flags().StringVar(&opts.InfraClusterName, "infra-cluster-name", opts.InfraClusterName, "name of the infrastructure cluster") cmd.Flags().StringVar(&opts.SubscriptionID, "subscription-id", opts.SubscriptionID, "subscription ID") cmd.Flags().StringVar(&opts.ResourceGroup, "resource-group", opts.ResourceGroup, "resource group") cmd.Flags().StringVar(&opts.ResourceId, "resource-id", opts.ResourceId, "resource ID") @@ -141,17 +145,17 @@ func (o *RawMustGatherOptions) Validate(ctx context.Context) (*ValidatedMustGath } // Validate subscription ID - if o.SubscriptionID == "" && o.ResourceId == "" { + if o.SubscriptionID == "" && o.ResourceId == "" && o.InfraClusterName == "" { return nil, fmt.Errorf("subscription-id is required") } // Validate resource group - if o.ResourceGroup == "" && o.ResourceId == "" { + if o.ResourceGroup == "" && o.ResourceId == "" && o.InfraClusterName == "" { return nil, fmt.Errorf("resource-group 
is required") } - if o.ResourceId != "" && (o.ResourceGroup != "" || o.SubscriptionID != "") { - logger.Info("warning: both resource-id and resource-group/subscription-id are provided, will use resource-id to gather cluster ID") + if o.ResourceId != "" && (o.ResourceGroup != "" || o.SubscriptionID != "" || o.InfraClusterName != "") { + logger.Info("warning: both resource-id and resource-group/subscription-id/infra-cluster-name are provided, will use resource-id to gather cluster ID") } if o.TimestampMin.After(o.TimestampMax) { diff --git a/tooling/hcpctl/pkg/kusto/configurable_query.go b/tooling/hcpctl/pkg/kusto/configurable_query.go index c62a91836b..65a0d656f2 100644 --- a/tooling/hcpctl/pkg/kusto/configurable_query.go +++ b/tooling/hcpctl/pkg/kusto/configurable_query.go @@ -45,6 +45,11 @@ func (q *ConfigurableQuery) WithTable(tableName string) *ConfigurableQuery { return q } +func (q *ConfigurableQuery) WithInfraFields() *ConfigurableQuery { + q.Query.AddLiteral("\n| project timestamp, log, cluster") + return q +} + func (q *ConfigurableQuery) WithDefaultFields() *ConfigurableQuery { q.Query.AddLiteral("\n| project timestamp, log, cluster, namespace_name, container_name") return q @@ -95,3 +100,9 @@ func (q *ConfigurableQuery) WithClusterIdOrSubscriptionAndResourceGroup(clusterI q.Parameters.AddString("subResourceGroupId", fmt.Sprintf("/subscriptions/%s/resourceGroups/%s", subscriptionId, resourceGroup)) return q } + +func (q *ConfigurableQuery) WithCluster(clusterName string) *ConfigurableQuery { + q.Query.AddLiteral("\n| where cluster == clusterName") + q.Parameters.AddString("clusterName", clusterName) + return q +} diff --git a/tooling/hcpctl/pkg/mustgather/gather.go b/tooling/hcpctl/pkg/mustgather/gather.go index a22e80c571..d7be1bfa98 100644 --- a/tooling/hcpctl/pkg/mustgather/gather.go +++ b/tooling/hcpctl/pkg/mustgather/gather.go @@ -84,6 +84,7 @@ type NormalizedLogLine struct { // These options are used to configure the Gatherer and are passed to the 
Gatherer constructor // They are used to generate the queries as well type GathererOptions struct { + GatherInfraLogs bool // Gather all logs from the infrastructure, does NOT gather HCP logs SkipHostedControlPlaneLogs bool // Skip hosted control plane logs QueryOptions *QueryOptions // Query options } @@ -174,6 +175,7 @@ type Gatherer struct { QueryClient QueryClientInterface outputFunc RowOutputFunc outputOptions RowOutputOptions + infraLogsOnly bool } // NewGatherer creates a new Gatherer with custom output function and options. @@ -218,6 +220,7 @@ func NewGatherer(queryClient QueryClientInterface, outputFunc RowOutputFunc, out outputFunc: outputFunc, outputOptions: outputOptions, opts: opts, + infraLogsOnly: false, } } @@ -238,12 +241,18 @@ func NewCliGatherer(queryClient QueryClientInterface, outputPath, serviceLogsDir outputFunc: cliOutputFunc, outputOptions: outputOptions, opts: opts, + infraLogsOnly: opts.GatherInfraLogs, } } func cliOutputFunc(logLineChan chan *NormalizedLogLine, queryType QueryType, options RowOutputOptions) error { outputPath := options["outputPath"].(string) - directory := options[string(queryType)].(string) + var directory string + var ok bool + if directory, ok = options[string(queryType)].(string); !ok { + directory = "cluster" + } + openedFiles := make(map[string]*os.File) var allErrors error @@ -258,7 +267,12 @@ func cliOutputFunc(logLineChan chan *NormalizedLogLine, queryType QueryType, opt }() for logLine := range logLineChan { - fileName := fmt.Sprintf("%s-%s-%s.log", logLine.Cluster, logLine.Namespace, logLine.ContainerName) + var fileName string + if queryType == QueryTypeKubernetesEvents || queryType == QueryTypeSystemdLogs { + fileName = fmt.Sprintf("%s-%s.log", logLine.Cluster, queryType) + } else { + fileName = fmt.Sprintf("%s-%s-%s.log", logLine.Cluster, logLine.Namespace, logLine.ContainerName) + } file, ok := openedFiles[fileName] if !ok { @@ -281,6 +295,10 @@ func cliOutputFunc(logLineChan chan *NormalizedLogLine, 
queryType QueryType, opt func (g *Gatherer) GatherLogs(ctx context.Context) error { logger := logr.FromContextOrDiscard(ctx) + if g.infraLogsOnly { + logger.V(1).Info("Gathering infrastructure logs only") + return g.gatherInfraLogs(ctx) + } // First, get all cluster IDs clusterIds, err := g.executeClusterIdQuery(ctx, g.opts.QueryOptions.GetClusterIdQuery()) @@ -308,6 +326,19 @@ func (g *Gatherer) GatherLogs(ctx context.Context) error { return nil } +func (g *Gatherer) gatherInfraLogs(ctx context.Context) error { + if err := g.queryAndWriteToFile(ctx, QueryTypeKubernetesEvents, g.opts.QueryOptions.GetInfraKubernetesEventsQuery()); err != nil { + return fmt.Errorf("failed to execute kubernetes events query: %w", err) + } + if err := g.queryAndWriteToFile(ctx, QueryTypeSystemdLogs, g.opts.QueryOptions.GetInfraSystemdLogsQuery()); err != nil { + return fmt.Errorf("failed to execute systemd logs query: %w", err) + } + if err := g.queryAndWriteToFile(ctx, QueryTypeServices, g.opts.QueryOptions.GetInfraServicesQueries()); err != nil { + return fmt.Errorf("failed to execute services query: %w", err) + } + return nil +} + func (g *Gatherer) executeClusterIdQuery(ctx context.Context, query *kusto.ConfigurableQuery) ([]string, error) { outputChannel := make(chan azkquery.Row) allClusterIds := make([]string, 0) @@ -340,7 +371,6 @@ func (g *Gatherer) executeClusterIdQuery(ctx context.Context, query *kusto.Confi } func (g *Gatherer) queryAndWriteToFile(ctx context.Context, queryType QueryType, queries []*kusto.ConfigurableQuery) error { - // logger := logr.FromContextOrDiscard(ctx) queryOutputChannel := make(chan azkquery.Row) queryGroup := new(errgroup.Group) diff --git a/tooling/hcpctl/pkg/mustgather/queries.go b/tooling/hcpctl/pkg/mustgather/queries.go index 64bfd75a63..6a731a1383 100644 --- a/tooling/hcpctl/pkg/mustgather/queries.go +++ b/tooling/hcpctl/pkg/mustgather/queries.go @@ -29,6 +29,8 @@ const ( QueryTypeServices QueryType = "services" QueryTypeHostedControlPlane 
QueryType = "hosted-control-plane" QueryTypeClusterId QueryType = "cluster-id" + QueryTypeKubernetesEvents QueryType = "kubernetes-events" + QueryTypeSystemdLogs QueryType = "systemd-logs" ) var servicesDatabase = "ServiceLogs" @@ -53,12 +55,13 @@ type QueryOptions struct { ClusterIds []string SubscriptionId string ResourceGroupName string + InfraClusterName string TimestampMin time.Time TimestampMax time.Time Limit int } -func NewQueryOptions(subscriptionID, resourceGroupName, resourceId string, timestampMin, timestampMax time.Time, limit int) (*QueryOptions, error) { +func NewQueryOptions(subscriptionID, resourceGroupName, resourceId, infraClusterName string, timestampMin, timestampMax time.Time, limit int) (*QueryOptions, error) { if resourceId != "" { res, err := azcorearm.ParseResourceID(resourceId) if err != nil { @@ -74,9 +77,57 @@ func NewQueryOptions(subscriptionID, resourceGroupName, resourceId string, times TimestampMin: timestampMin, TimestampMax: timestampMax, Limit: limit, + InfraClusterName: infraClusterName, }, nil } +func (opts *QueryOptions) GetInfraKubernetesEventsQuery() []*kusto.ConfigurableQuery { + query := kusto.NewConfigurableQuery("kubernetesEvents", servicesDatabase) + if opts.Limit < 0 { + query.WithNoTruncation() + } + query.WithTable("kubernetesEvents").WithInfraFields() + query.WithCluster(opts.InfraClusterName) + if opts.Limit > 0 { + query.WithLimit(opts.Limit) + } + query.WithOrderByTimestampAsc() + return []*kusto.ConfigurableQuery{query} +} + +func (opts *QueryOptions) GetInfraSystemdLogsQuery() []*kusto.ConfigurableQuery { + query := kusto.NewConfigurableQuery("systemdLogs", servicesDatabase) + if opts.Limit < 0 { + query.WithNoTruncation() + } + query.WithTable("systemdLogs").WithInfraFields() + query.WithCluster(opts.InfraClusterName) + if opts.Limit > 0 { + query.WithLimit(opts.Limit) + } + query.WithOrderByTimestampAsc() + return []*kusto.ConfigurableQuery{query} +} + +func (opts *QueryOptions) GetInfraServicesQueries() 
[]*kusto.ConfigurableQuery { + queries := []*kusto.ConfigurableQuery{} + for _, table := range servicesTables { + query := kusto.NewConfigurableQuery(table, servicesDatabase) + if opts.Limit < 0 { + query.WithNoTruncation() + } + query.WithTable(table).WithDefaultFields() + query.WithTimestampMinAndMax(opts.TimestampMin, opts.TimestampMax) + query.WithCluster(opts.InfraClusterName) + if opts.Limit > 0 { + query.WithLimit(opts.Limit) + } + query.WithOrderByTimestampAsc() + queries = append(queries, query) + } + return queries +} + func (opts *QueryOptions) GetServicesQueries() []*kusto.ConfigurableQuery { queries := []*kusto.ConfigurableQuery{} for _, table := range servicesTables { diff --git a/tooling/hcpctl/pkg/mustgather/queries_test.go b/tooling/hcpctl/pkg/mustgather/queries_test.go index 620c4d7671..02fa7c1c40 100644 --- a/tooling/hcpctl/pkg/mustgather/queries_test.go +++ b/tooling/hcpctl/pkg/mustgather/queries_test.go @@ -26,19 +26,19 @@ func TestNewQueryOptions(t *testing.T) { now := time.Now() // With resource ID - opts, err := NewQueryOptions("", "", "/subscriptions/test-sub/resourceGroups/test-rg", now, now, 100) + opts, err := NewQueryOptions("", "", "/subscriptions/test-sub/resourceGroups/test-rg", "", now, now, 100) require.NoError(t, err) assert.Equal(t, "test-sub", opts.SubscriptionId) assert.Equal(t, "test-rg", opts.ResourceGroupName) // With subscription/resource group - opts, err = NewQueryOptions("sub", "rg", "", now, now, 100) + opts, err = NewQueryOptions("sub", "rg", "", "", now, now, 100) require.NoError(t, err) assert.Equal(t, "sub", opts.SubscriptionId) assert.Equal(t, "rg", opts.ResourceGroupName) // Invalid resource ID - _, err = NewQueryOptions("", "", "/invalid", now, now, 100) + _, err = NewQueryOptions("", "", "/invalid", "", now, now, 100) assert.Error(t, err) } From b35f41ed9c12600af0e8814d5ccbc5c51bbfbd47 Mon Sep 17 00:00:00 2001 From: Jan-Hendrik Boll Date: Mon, 23 Feb 2026 10:48:17 +0000 Subject: [PATCH 2/4] Fix concurrency in 
must-gather code Errors could cause deadlocks in must-gather code. Reason was missing handling of context cancelations and lack of error fetching. This refactoring now introduces the missing concepts. --- test/util/verifiers/kusto.go | 3 +- tooling/hcpctl/cmd/must-gather/cmd.go | 1 + .../hcpctl/cmd/must-gather/query_options.go | 5 + tooling/hcpctl/pkg/kusto/client.go | 10 +- tooling/hcpctl/pkg/mustgather/gather.go | 118 +++++++++--------- tooling/hcpctl/pkg/mustgather/gather_test.go | 20 ++- tooling/hcpctl/pkg/mustgather/queryclient.go | 36 ++---- .../hcpctl/pkg/mustgather/queryclient_test.go | 14 +-- 8 files changed, 103 insertions(+), 104 deletions(-) diff --git a/test/util/verifiers/kusto.go b/test/util/verifiers/kusto.go index 450d451da6..bc8a79ca7b 100644 --- a/test/util/verifiers/kusto.go +++ b/test/util/verifiers/kusto.go @@ -78,6 +78,7 @@ func (v verifyMustGatherLogsImpl) Verify(ctx context.Context) error { v.config.SubscriptionID, v.config.ResourceGroup, "", // resourceId + "", // infraClusterName time.Now().Add(-24*time.Hour), // timestampMin time.Now(), // timestampMax -1, // limit: -1 means no truncation @@ -89,7 +90,7 @@ func (v verifyMustGatherLogsImpl) Verify(ctx context.Context) error { foundLogSources := make(map[string]bool) var foundMutex sync.Mutex - outputFunc := func(logLineChan chan *mustgather.NormalizedLogLine, queryType mustgather.QueryType, options mustgather.RowOutputOptions) error { + outputFunc := func(ctx context.Context, logLineChan chan *mustgather.NormalizedLogLine, queryType mustgather.QueryType, options mustgather.RowOutputOptions) error { for logLine := range logLineChan { // Create a key for namespace/container combination key := fmt.Sprintf("%s/%s", logLine.Namespace, logLine.ContainerName) diff --git a/tooling/hcpctl/cmd/must-gather/cmd.go b/tooling/hcpctl/cmd/must-gather/cmd.go index c8a32faa1e..c84575b52b 100644 --- a/tooling/hcpctl/cmd/must-gather/cmd.go +++ b/tooling/hcpctl/cmd/must-gather/cmd.go @@ -20,6 +20,7 @@ import 
( var ServicesLogDirectory = "service" var HostedControlPlaneLogDirectory = "hosted-control-plane" +var InfraLogDirectory = "cluster" var OptionsOutputFile = "options.json" diff --git a/tooling/hcpctl/cmd/must-gather/query_options.go b/tooling/hcpctl/cmd/must-gather/query_options.go index 9781d5069c..a880190cab 100644 --- a/tooling/hcpctl/cmd/must-gather/query_options.go +++ b/tooling/hcpctl/cmd/must-gather/query_options.go @@ -187,6 +187,11 @@ func (o *ValidatedMustGatherOptions) Complete(ctx context.Context) (*MustGatherO return nil, fmt.Errorf("failed to create service logs directory: %w", err) } + err = os.MkdirAll(path.Join(o.OutputPath, InfraLogDirectory), 0755) + if err != nil { + return nil, fmt.Errorf("failed to create infrastructure logs directory: %w", err) + } + if !o.SkipHostedControlPlaneLogs { err = os.MkdirAll(path.Join(o.OutputPath, HostedControlPlaneLogDirectory), 0755) if err != nil { diff --git a/tooling/hcpctl/pkg/kusto/client.go b/tooling/hcpctl/pkg/kusto/client.go index 7b24871390..b64362a5ee 100644 --- a/tooling/hcpctl/pkg/kusto/client.go +++ b/tooling/hcpctl/pkg/kusto/client.go @@ -109,6 +109,7 @@ func (c *Client) ExecutePreconfiguredQuery(ctx context.Context, query *Configura startTime := time.Now() // Process the first table (primary result) + logger.V(6).Info("Processing primary result") primaryResult := <-dataset.Tables() err = primaryResult.Err() @@ -122,6 +123,7 @@ func (c *Client) ExecutePreconfiguredQuery(ctx context.Context, query *Configura columsSet := false for row := range primaryResult.Table().Rows() { + logger.V(8).Info("Processing row", "rowNumber", totalRows) row := row.Row() if row == nil { if query.Unlimited { @@ -133,14 +135,18 @@ func (c *Client) ExecutePreconfiguredQuery(ctx context.Context, query *Configura columns = row.Columns() columsSet = true } - outputChannel <- row + select { + case <-ctx.Done(): + return nil, ctx.Err() + case outputChannel <- row: + } totalRows++ dataSize += int64(len(fmt.Sprintf("%v", row))) 
} executionTime := time.Since(startTime) - logger.V(1).Info("Query competed", "query", query.Name, "rows", totalRows, "KiloBytes", dataSize/1024, "executionTime", executionTime) + logger.V(1).Info("Query completed", "query", query.Name, "rows", totalRows, "KiloBytes", dataSize/1024, "executionTime", executionTime) return &QueryResult{ Columns: columns, diff --git a/tooling/hcpctl/pkg/mustgather/gather.go b/tooling/hcpctl/pkg/mustgather/gather.go index d7be1bfa98..d63541cf58 100644 --- a/tooling/hcpctl/pkg/mustgather/gather.go +++ b/tooling/hcpctl/pkg/mustgather/gather.go @@ -50,7 +50,7 @@ type RowOutputOptions map[string]any // // Custom implementations can output to files, databases, APIs, or any other destination. // The channel will be closed by the caller when all data has been sent. -type RowOutputFunc func(logLineChan chan *NormalizedLogLine, queryType QueryType, options RowOutputOptions) error +type RowOutputFunc func(ctx context.Context, logLineChan chan *NormalizedLogLine, queryType QueryType, options RowOutputOptions) error // NormalizedLogLine represents a single log entry with standardized fields. // This structure is passed to RowOutputFunc implementations for processing. 
@@ -245,7 +245,7 @@ func NewCliGatherer(queryClient QueryClientInterface, outputPath, serviceLogsDir } } -func cliOutputFunc(logLineChan chan *NormalizedLogLine, queryType QueryType, options RowOutputOptions) error { +func cliOutputFunc(ctx context.Context, logLineChan chan *NormalizedLogLine, queryType QueryType, options RowOutputOptions) error { outputPath := options["outputPath"].(string) var directory string var ok bool @@ -265,31 +265,36 @@ func cliOutputFunc(logLineChan chan *NormalizedLogLine, queryType QueryType, opt } } }() - - for logLine := range logLineChan { - var fileName string - if queryType == QueryTypeKubernetesEvents || queryType == QueryTypeSystemdLogs { - fileName = fmt.Sprintf("%s-%s.log", logLine.Cluster, queryType) - } else { - fileName = fmt.Sprintf("%s-%s-%s.log", logLine.Cluster, logLine.Namespace, logLine.ContainerName) - } - - file, ok := openedFiles[fileName] - if !ok { - newFile, err := os.Create(path.Join(outputPath, directory, fileName)) - if err != nil { - allErrors = errors.Join(allErrors, fmt.Errorf("failed to create output file: %w", err)) + for { + select { + case <-ctx.Done(): + return ctx.Err() + case logLine, ok := <-logLineChan: + if !ok { return allErrors } - openedFiles[fileName] = newFile - file = newFile - } + var fileName string + if queryType == QueryTypeKubernetesEvents || queryType == QueryTypeSystemdLogs { + fileName = fmt.Sprintf("%s-%s.log", logLine.Cluster, queryType) + } else { + fileName = fmt.Sprintf("%s-%s-%s.log", logLine.Cluster, logLine.Namespace, logLine.ContainerName) + } - if _, err := fmt.Fprintf(file, "%s\n", string(logLine.Log)); err != nil { - allErrors = errors.Join(allErrors, fmt.Errorf("failed to write to file %s: %w", fileName, err)) - continue + file, ok := openedFiles[fileName] + if !ok { + newFile, err := os.Create(path.Join(outputPath, directory, fileName)) + if err != nil { + return errors.Join(allErrors, fmt.Errorf("failed to create output file: %w", err)) + } + openedFiles[fileName] = 
newFile + file = newFile + } + if _, err := fmt.Fprintf(file, "%s\n", string(logLine.Log)); err != nil { + allErrors = errors.Join(allErrors, fmt.Errorf("failed to write to file %s: %w", fileName, err)) + } } } + return allErrors } @@ -371,54 +376,53 @@ func (g *Gatherer) executeClusterIdQuery(ctx context.Context, query *kusto.Confi } func (g *Gatherer) queryAndWriteToFile(ctx context.Context, queryType QueryType, queries []*kusto.ConfigurableQuery) error { + logger := logr.FromContextOrDiscard(ctx) queryOutputChannel := make(chan azkquery.Row) + logLineChan := make(chan *NormalizedLogLine) - queryGroup := new(errgroup.Group) + logger.V(6).Info("Executing query", "queryType", queryType, "queries", len(queries), "queries", queries) + + queryGroup, queryCtx := errgroup.WithContext(ctx) queryGroup.Go(func() error { - return g.QueryClient.ConcurrentQueries(ctx, queries, queryOutputChannel) + defer close(queryOutputChannel) + return g.QueryClient.ConcurrentQueries(queryCtx, queries, queryOutputChannel) }) - consumerGroup := new(errgroup.Group) - consumerGroup.Go(func() error { - return g.convertRowsAndOutput(queryOutputChannel, queryType) + queryGroup.Go(func() error { + return g.outputFunc(queryCtx, logLineChan, queryType, g.outputOptions) + }) + + queryGroup.Go(func() error { + defer close(logLineChan) + return g.convertRows(queryCtx, queryOutputChannel, logLineChan) }) + logger.V(6).Info("Waiting for query to complete", "queryType", queryType) if err := queryGroup.Wait(); err != nil { return fmt.Errorf("error during query execution: %w", err) } - close(queryOutputChannel) - if err := consumerGroup.Wait(); err != nil { - return fmt.Errorf("error during query data transformation: %w", err) - } + return nil } -func (g *Gatherer) convertRowsAndOutput(outputChannel <-chan azkquery.Row, queryType QueryType) error { - logLineChan := make(chan *NormalizedLogLine) - - // Start output processing in background - outputErrChan := make(chan error, 1) - go func() { - outputErrChan 
<- g.outputFunc(logLineChan, queryType, g.outputOptions) - }() - - // Process rows and send to output - for row := range outputChannel { - normalizedLogLine := &NormalizedLogLine{} - if err := row.ToStruct(normalizedLogLine); err != nil { - close(logLineChan) - return fmt.Errorf("failed to convert row to struct: %w", err) +func (g *Gatherer) convertRows(ctx context.Context, rowChannel <-chan azkquery.Row, outPutChannel chan<- *NormalizedLogLine) error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + case row, ok := <-rowChannel: + if !ok { + return nil + } + normalizedLogLine := &NormalizedLogLine{} + if err := row.ToStruct(normalizedLogLine); err != nil { + return fmt.Errorf("failed to convert row to struct: %w", err) + } + select { + case <-ctx.Done(): + return ctx.Err() + case outPutChannel <- normalizedLogLine: // now interruptible + } } - logLineChan <- normalizedLogLine } - - // Close the channel to signal completion to the output function - close(logLineChan) - - // Wait for output processing to complete and check for errors - if outputErr := <-outputErrChan; outputErr != nil { - return fmt.Errorf("failed to output data: %w", outputErr) - } - - return nil } diff --git a/tooling/hcpctl/pkg/mustgather/gather_test.go b/tooling/hcpctl/pkg/mustgather/gather_test.go index 88bce33b29..ae64e3403e 100644 --- a/tooling/hcpctl/pkg/mustgather/gather_test.go +++ b/tooling/hcpctl/pkg/mustgather/gather_test.go @@ -55,7 +55,7 @@ func (m *MockQueryClient) ExecutePreconfiguredQuery(ctx context.Context, query * return result.(*kusto.QueryResult), args.Error(1) } -func mockOutputFunc(logLineChan chan *NormalizedLogLine, queryType QueryType, options RowOutputOptions) error { +func mockOutputFunc(ctx context.Context, logLineChan chan *NormalizedLogLine, queryType QueryType, options RowOutputOptions) error { for range logLineChan { // Consume all messages } @@ -77,7 +77,7 @@ func TestNewGatherer(t *testing.T) { assert.Equal(t, mockQueryClient, gatherer.QueryClient) 
// Test custom gatherer - customOutputFunc := func(logLineChan chan *NormalizedLogLine, queryType QueryType, options RowOutputOptions) error { + customOutputFunc := func(ctx context.Context, logLineChan chan *NormalizedLogLine, queryType QueryType, options RowOutputOptions) error { for range logLineChan { } return nil @@ -102,19 +102,17 @@ func TestGatherer_GatherLogs(t *testing.T) { outputOptions: RowOutputOptions{"outputPath": "/test"}, } - ctx := context.Background() - // Success case - mockQueryClient.On("ExecutePreconfiguredQuery", ctx, mock.AnythingOfType("*kusto.ConfigurableQuery"), mock.Anything).Return(&kusto.QueryResult{}, nil).Once() - mockQueryClient.On("ConcurrentQueries", ctx, mock.AnythingOfType("[]*kusto.ConfigurableQuery"), mock.Anything).Return(nil).Twice() + mockQueryClient.On("ExecutePreconfiguredQuery", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("*kusto.ConfigurableQuery"), mock.Anything).Return(&kusto.QueryResult{}, nil).Once() + mockQueryClient.On("ConcurrentQueries", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("[]*kusto.ConfigurableQuery"), mock.Anything).Return(nil).Twice() - err := gatherer.GatherLogs(ctx) + err := gatherer.GatherLogs(t.Context()) assert.NoError(t, err) // Error case - mockQueryClient.On("ExecutePreconfiguredQuery", ctx, mock.AnythingOfType("*kusto.ConfigurableQuery"), mock.Anything).Return(nil, errors.New("query failed")) + mockQueryClient.On("ExecutePreconfiguredQuery", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("*kusto.ConfigurableQuery"), mock.Anything).Return(nil, errors.New("query failed")) - err = gatherer.GatherLogs(ctx) + err = gatherer.GatherLogs(t.Context()) assert.Error(t, err) mockQueryClient.AssertExpectations(t) @@ -146,7 +144,7 @@ func TestCliOutputFunc(t *testing.T) { close(logLineChan) }() - err = cliOutputFunc(logLineChan, QueryTypeServices, options) + err = cliOutputFunc(t.Context(), logLineChan, QueryTypeServices, options) assert.NoError(t, 
err) // Verify file was created and contains log @@ -174,6 +172,6 @@ func TestCliOutputFunc(t *testing.T) { close(logLineChan) }() - err = cliOutputFunc(logLineChan, QueryTypeServices, badOptions) + err = cliOutputFunc(t.Context(), logLineChan, QueryTypeServices, badOptions) assert.Error(t, err) } diff --git a/tooling/hcpctl/pkg/mustgather/queryclient.go b/tooling/hcpctl/pkg/mustgather/queryclient.go index 6b7a7638f2..9e6da2318a 100644 --- a/tooling/hcpctl/pkg/mustgather/queryclient.go +++ b/tooling/hcpctl/pkg/mustgather/queryclient.go @@ -16,12 +16,11 @@ package mustgather import ( "context" - "errors" "fmt" - "sync" "time" "github.com/go-logr/logr" + "golang.org/x/sync/errgroup" azkquery "github.com/Azure/azure-kusto-go/azkustodata/query" @@ -64,41 +63,26 @@ func NewQueryClientWithFileWriter(client kusto.KustoClient, queryTimeout time.Du func (q *QueryClient) ConcurrentQueries(ctx context.Context, queries []*kusto.ConfigurableQuery, outputChannel chan<- azkquery.Row) error { logger := logr.FromContextOrDiscard(ctx) - wg := sync.WaitGroup{} - wg.Add(len(queries)) - errorCh := make(chan error, len(queries)) - - for i, query := range queries { - go func(query *kusto.ConfigurableQuery, queryIndex int) { - defer wg.Done() - result, err := q.Client.ExecutePreconfiguredQuery(ctx, query, outputChannel) + queryGroup, queryCtx := errgroup.WithContext(ctx) + for _, query := range queries { + queryGroup.Go(func() error { + result, err := q.Client.ExecutePreconfiguredQuery(queryCtx, query, outputChannel) if err != nil { logger.Error(err, "Query failed", "name", query.Name) - errorCh <- fmt.Errorf("failed to execute query: %w", err) - return + return fmt.Errorf("failed to execute query: %w", err) } if q.FileWriter != nil { err = q.FileWriter.WriteFile(q.OutputPath, fmt.Sprintf("%s.json", query.Name), result) if err != nil { - errorCh <- fmt.Errorf("failed to write query result to file: %w", err) + return fmt.Errorf("failed to write query result to file: %w", err) } } - 
}(query, i) - } - - wg.Wait() - close(errorCh) - - var allErrors error - for err := range errorCh { - allErrors = errors.Join(allErrors, err) - } - if allErrors != nil { - return fmt.Errorf("failed to execute queries: %w", allErrors) + return nil + }) } - return nil + return queryGroup.Wait() } func (q *QueryClient) Close() error { diff --git a/tooling/hcpctl/pkg/mustgather/queryclient_test.go b/tooling/hcpctl/pkg/mustgather/queryclient_test.go index a358fa0de3..2fc91844bf 100644 --- a/tooling/hcpctl/pkg/mustgather/queryclient_test.go +++ b/tooling/hcpctl/pkg/mustgather/queryclient_test.go @@ -138,8 +138,8 @@ func TestQueryClient_ConcurrentQueries_Success(t *testing.T) { } // Set up mock expectations - mockClient.On("ExecutePreconfiguredQuery", ctx, query1, mock.Anything).Return(result1, nil) - mockClient.On("ExecutePreconfiguredQuery", ctx, query2, mock.Anything).Return(result2, nil) + mockClient.On("ExecutePreconfiguredQuery", mock.AnythingOfType("*context.cancelCtx"), query1, mock.Anything).Return(result1, nil) + mockClient.On("ExecutePreconfiguredQuery", mock.AnythingOfType("*context.cancelCtx"), query2, mock.Anything).Return(result2, nil) mockFileWriter.On("WriteFile", "/test/output", "query1.json", result1).Return(nil) mockFileWriter.On("WriteFile", "/test/output", "query2.json", result2).Return(nil) @@ -169,7 +169,7 @@ func TestQueryClient_ConcurrentQueries_QueryExecutionError(t *testing.T) { queries := []*kusto.ConfigurableQuery{query} expectedError := errors.New("query execution failed") - mockClient.On("ExecutePreconfiguredQuery", ctx, query, mock.Anything).Return((*kusto.QueryResult)(nil), expectedError) + mockClient.On("ExecutePreconfiguredQuery", mock.AnythingOfType("*context.cancelCtx"), query, mock.Anything).Return((*kusto.QueryResult)(nil), expectedError) queryClient := &QueryClient{ Client: mockClient, @@ -180,7 +180,7 @@ func TestQueryClient_ConcurrentQueries_QueryExecutionError(t *testing.T) { err := queryClient.ConcurrentQueries(ctx, queries, 
outputChannel) assert.Error(t, err) - assert.Contains(t, err.Error(), "failed to execute queries") + assert.Contains(t, err.Error(), "failed to execute query") mockClient.AssertExpectations(t) mockFileWriter.AssertExpectations(t) } @@ -202,7 +202,7 @@ func TestQueryClient_ConcurrentQueries_FileWriteError(t *testing.T) { } expectedWriteError := errors.New("file write failed") - mockClient.On("ExecutePreconfiguredQuery", ctx, query, mock.Anything).Return(result, nil) + mockClient.On("ExecutePreconfiguredQuery", mock.AnythingOfType("*context.cancelCtx"), query, mock.Anything).Return(result, nil) mockFileWriter.On("WriteFile", "/test/output", "query_with_write_error.json", result).Return(expectedWriteError) queryClient := &QueryClient{ @@ -214,7 +214,7 @@ func TestQueryClient_ConcurrentQueries_FileWriteError(t *testing.T) { err := queryClient.ConcurrentQueries(ctx, queries, outputChannel) assert.Error(t, err) - assert.Contains(t, err.Error(), "failed to execute queries") + assert.Contains(t, err.Error(), "failed to write query result to file") mockClient.AssertExpectations(t) mockFileWriter.AssertExpectations(t) } @@ -271,7 +271,7 @@ func TestQueryClient_ConcurrentQueries_Concurrency(t *testing.T) { query := queries[i] result := results[i] - mockClient.On("ExecutePreconfiguredQuery", ctx, query, mock.Anything).Run(func(args mock.Arguments) { + mockClient.On("ExecutePreconfiguredQuery", mock.AnythingOfType("*context.cancelCtx"), query, mock.Anything).Run(func(args mock.Arguments) { mu.Lock() executionTimes[query.Name] = time.Now() mu.Unlock() From aa50e560735c4f4d430ac93902a5e33f5aab47ef Mon Sep 17 00:00:00 2001 From: Jan-Hendrik Boll Date: Wed, 25 Feb 2026 09:40:28 +0000 Subject: [PATCH 3/4] Refactor cli interface --- tooling/hcpctl/README.md | 25 ++- tooling/hcpctl/cmd/must-gather/cmd.go | 9 +- .../cmd/must-gather/legacy_query_cmd.go | 22 +-- .../cmd/must-gather/query_base_options.go | 123 ++++++++++++ tooling/hcpctl/cmd/must-gather/query_cmd.go | 10 +- 
.../hcpctl/cmd/must-gather/query_infra_cmd.go | 104 ++++++++++ .../cmd/must-gather/query_infra_options.go | 112 +++++++++++ .../hcpctl/cmd/must-gather/query_options.go | 182 ++++++------------ tooling/hcpctl/pkg/mustgather/queries.go | 19 +- tooling/hcpctl/pkg/mustgather/queries_test.go | 6 +- 10 files changed, 461 insertions(+), 151 deletions(-) create mode 100644 tooling/hcpctl/cmd/must-gather/query_base_options.go create mode 100644 tooling/hcpctl/cmd/must-gather/query_infra_cmd.go create mode 100644 tooling/hcpctl/cmd/must-gather/query_infra_options.go diff --git a/tooling/hcpctl/README.md b/tooling/hcpctl/README.md index 92ac145311..54419fefa3 100644 --- a/tooling/hcpctl/README.md +++ b/tooling/hcpctl/README.md @@ -120,10 +120,12 @@ hcpctl hcp breakglass 12345678-1234-1234-1234-123456789abc --privileged ## Gather logs from Kusto -You can gather logs for a managed cluster from Kusto. You need to be logged into Azure to access Kusto. You need to set kusto and region to point to the Kusto instance containing the desired logs. +### Gather Managed cluster logs + +This is the usual use case for must-gather kust. You can gather logs for a managed cluster from Kusto. You need to be logged into Azure to access Kusto. You need to set kusto and region to point to the Kusto instance containing the desired logs. ```bash -hcpctl must-gather legacy-query --kusto $kusto --region $region --subscription-id $subscription_id --resource-group $resource_group +hcpctl must-gather query --kusto $kusto --region $region --subscription-id $subscription_id --resource-group $resource_group ``` If you get an error like, limit execeeded try reducing the amount of data by setting either limit or timestamps, i.e.: @@ -131,12 +133,29 @@ If you get an error like, limit execeeded try reducing the amount of data by set Set `--limit` fetch the first `$limit` number of rows. 
```bash -hcpctl must-gather legacy-query \ +hcpctl must-gather query \ --kusto aroint --region eastus \ --subscription-id $subscription_id --resource-group $resource_group --limit 10000 ``` +The parameters $resource_group and $subscription_id must point to the managed cluster, not the AKS cluster running this HCP/Service. + +### Gather infra cluster logs + +If you want to gather all Kusto logs for a given infra cluster (servicecluster or management), you can run + +```bash +hcpctl must-gather query-infra \ + --kusto aroint --region eastus \ + --service-cluster $svc_cluster_name \ + --mgmt-cluster $mgmt_cluster_name \ + --limit 10000 + +``` + +You can provide multiple `service-cluster` parameters and multiple `mgmt-cluster`. Logs will be collected sequentially and stored in a single folder for all clusters provided. + ## TODO - use the Hypershift generated clientsets instead of dedicated schema registration diff --git a/tooling/hcpctl/cmd/must-gather/cmd.go b/tooling/hcpctl/cmd/must-gather/cmd.go index c84575b52b..941e2a372d 100644 --- a/tooling/hcpctl/cmd/must-gather/cmd.go +++ b/tooling/hcpctl/cmd/must-gather/cmd.go @@ -47,7 +47,14 @@ and collecting diagnostic data for troubleshooting and analysis.`, } cmd.AddCommand(queryCmd) - // Add query subcommand + // Add query-infra subcommand + queryInfraCmd, err := newQueryInfraCommand() + if err != nil { + return nil, err + } + cmd.AddCommand(queryInfraCmd) + + // Add legacy-query subcommand queryCmdLegacy, err := newQueryCommandLegacy() if err != nil { return nil, err diff --git a/tooling/hcpctl/cmd/must-gather/legacy_query_cmd.go b/tooling/hcpctl/cmd/must-gather/legacy_query_cmd.go index 46071d291e..da5ef39529 100644 --- a/tooling/hcpctl/cmd/must-gather/legacy_query_cmd.go +++ b/tooling/hcpctl/cmd/must-gather/legacy_query_cmd.go @@ -44,7 +44,7 @@ type LegacyNormalizedLogLine struct { } func newQueryCommandLegacy() (*cobra.Command, error) { - opts := DefaultMustGatherOptions() + opts := DefaultQueryOptions() cmd := 
&cobra.Command{ Use: "legacy-query", @@ -58,13 +58,13 @@ func newQueryCommandLegacy() (*cobra.Command, error) { return opts.Run(cmd.Context(), true) }, } - if err := BindMustGatherOptions(opts, cmd); err != nil { + if err := BindQueryOptions(opts, cmd); err != nil { return nil, err } return cmd, nil } -func (opts *MustGatherOptions) RunLegacy(ctx context.Context) error { +func (opts *CompletedQueryOptions) RunLegacy(ctx context.Context) error { logger := logr.FromContextOrDiscard(ctx) clusterIds, err := executeClusterIdQuery(ctx, opts, GetKubeSystemClusterIdQuery(opts)) if err != nil { @@ -112,17 +112,17 @@ func processKubesystemLogsRow(row *KubesystemLogsRow) error { return nil } -func executeKubeSystemQueries(ctx context.Context, opts *MustGatherOptions, queryOpts mustgather.QueryOptions) error { +func executeKubeSystemQueries(ctx context.Context, opts *CompletedQueryOptions, queryOpts mustgather.QueryOptions) error { query := GetKubeSystemQuery(opts, queryOpts.ClusterIds) return castQueryAndWriteToFile(ctx, opts, ServicesLogDirectory, []*kusto.ConfigurableQuery{query}) } -func executeKubeSystemHostedControlPlaneLogsQuery(ctx context.Context, opts *MustGatherOptions) error { +func executeKubeSystemHostedControlPlaneLogsQuery(ctx context.Context, opts *CompletedQueryOptions) error { query := GetKubeSystemHostedControlPlaneLogsQuery(opts) return castQueryAndWriteToFile(ctx, opts, HostedControlPlaneLogDirectory, query) } -func castQueryAndWriteToFile(ctx context.Context, opts *MustGatherOptions, targetDirectory string, queries []*kusto.ConfigurableQuery) error { +func castQueryAndWriteToFile(ctx context.Context, opts *CompletedQueryOptions, targetDirectory string, queries []*kusto.ConfigurableQuery) error { castFunction := func(input azkquery.Row) (*LegacyNormalizedLogLine, error) { // can directly cast, cause the row is already normalized legacyLogLine := &KubesystemLogsRow{} @@ -153,15 +153,15 @@ type KubesystemLogsRow struct { Kubernetes string 
`kusto:"kubernetes"` } -func GetKubeSystemClusterIdQuery(opts *MustGatherOptions) *kusto.ConfigurableQuery { +func GetKubeSystemClusterIdQuery(opts *CompletedQueryOptions) *kusto.ConfigurableQuery { return kusto.NewLegacyClusterIdQuery(opts.SubscriptionID, opts.ResourceGroup, opts.TimestampMin, opts.TimestampMax, opts.Limit) } -func GetKubeSystemQuery(opts *MustGatherOptions, clusterIds []string) *kusto.ConfigurableQuery { +func GetKubeSystemQuery(opts *CompletedQueryOptions, clusterIds []string) *kusto.ConfigurableQuery { return kusto.NewKubeSystemQuery(opts.SubscriptionID, opts.ResourceGroup, clusterIds, opts.TimestampMin, opts.TimestampMax, opts.Limit) } -func GetKubeSystemHostedControlPlaneLogsQuery(opts *MustGatherOptions) []*kusto.ConfigurableQuery { +func GetKubeSystemHostedControlPlaneLogsQuery(opts *CompletedQueryOptions) []*kusto.ConfigurableQuery { queries := []*kusto.ConfigurableQuery{} for _, clusterId := range opts.QueryOptions.ClusterIds { query := kusto.NewCustomerKubeSystemQuery(clusterId, opts.TimestampMin, opts.TimestampMax, opts.Limit) @@ -170,7 +170,7 @@ func GetKubeSystemHostedControlPlaneLogsQuery(opts *MustGatherOptions) []*kusto. 
return queries } -func queryAndWriteToFile(ctx context.Context, opts *MustGatherOptions, targetDirectory string, castFunction func(input azkquery.Row) (*LegacyNormalizedLogLine, error), queries []*kusto.ConfigurableQuery) error { +func queryAndWriteToFile(ctx context.Context, opts *CompletedQueryOptions, targetDirectory string, castFunction func(input azkquery.Row) (*LegacyNormalizedLogLine, error), queries []*kusto.ConfigurableQuery) error { // logger := logr.FromContextOrDiscard(ctx) queryOutputChannel := make(chan azkquery.Row) @@ -219,7 +219,7 @@ func writeNormalizedLogsToFile(outputChannel chan azkquery.Row, castFunction fun return allErrors } -func executeClusterIdQuery(ctx context.Context, opts *MustGatherOptions, query *kusto.ConfigurableQuery) ([]string, error) { +func executeClusterIdQuery(ctx context.Context, opts *CompletedQueryOptions, query *kusto.ConfigurableQuery) ([]string, error) { outputChannel := make(chan azkquery.Row) allClusterIds := make([]string, 0) diff --git a/tooling/hcpctl/cmd/must-gather/query_base_options.go b/tooling/hcpctl/cmd/must-gather/query_base_options.go new file mode 100644 index 0000000000..7765afb6e9 --- /dev/null +++ b/tooling/hcpctl/cmd/must-gather/query_base_options.go @@ -0,0 +1,123 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package mustgather + +import ( + "fmt" + "net/url" + "os" + "path" + "time" + + "github.com/spf13/cobra" + + "github.com/Azure/ARO-HCP/tooling/hcpctl/pkg/kusto" + "github.com/Azure/ARO-HCP/tooling/hcpctl/pkg/mustgather" +) + +// BaseGatherOptions holds configuration shared across all query commands. +type BaseGatherOptions struct { + Kusto string // Name of the Azure Data Explorer cluster + Region string // Region of the Azure Data Explorer cluster + OutputPath string // Path to write the output file + QueryTimeout time.Duration // Timeout for query execution + TimestampMin time.Time // Timestamp minimum + TimestampMax time.Time // Timestamp maximum + Limit int // Limit the number of results +} + +// DefaultBaseGatherOptions returns BaseGatherOptions initialized with sensible defaults. +func DefaultBaseGatherOptions() BaseGatherOptions { + return BaseGatherOptions{ + QueryTimeout: 5 * time.Minute, + TimestampMin: time.Now().Add(-24 * time.Hour), + TimestampMax: time.Now(), + Limit: -1, // defaults to no limit + OutputPath: fmt.Sprintf("must-gather-%s", time.Now().Format("20060102-150405")), + } +} + +// BindBaseGatherOptions configures cobra command flags for the shared gather options. 
+func BindBaseGatherOptions(opts *BaseGatherOptions, cmd *cobra.Command) error { + cmd.Flags().StringVar(&opts.Kusto, "kusto", opts.Kusto, "Azure Data Explorer cluster name (required)") + cmd.Flags().StringVar(&opts.Region, "region", opts.Region, "Azure Data Explorer cluster region (required)") + cmd.Flags().DurationVar(&opts.QueryTimeout, "query-timeout", opts.QueryTimeout, "timeout for query execution") + cmd.Flags().StringVar(&opts.OutputPath, "output-path", opts.OutputPath, "path to write the output file") + cmd.Flags().TimeVar(&opts.TimestampMin, "timestamp-min", opts.TimestampMin, []string{time.DateTime}, "timestamp minimum") + cmd.Flags().TimeVar(&opts.TimestampMax, "timestamp-max", opts.TimestampMax, []string{time.DateTime}, "timestamp maximum") + cmd.Flags().IntVar(&opts.Limit, "limit", opts.Limit, "limit the number of results") + + requiredFlags := []string{"kusto", "region"} + for _, flag := range requiredFlags { + if err := cmd.MarkFlagRequired(flag); err != nil { + return fmt.Errorf("failed to mark %s as required: %w", flag, err) + } + } + + return nil +} + +// validateBaseGatherOptions validates the shared gather options and returns a kusto endpoint URL. 
+func validateBaseGatherOptions(opts *BaseGatherOptions) (*url.URL, error) { + if opts.Kusto == "" { + return nil, fmt.Errorf("kusto is required") + } + if opts.Region == "" { + return nil, fmt.Errorf("region is required") + } + + kustoEndpoint, err := kusto.KustoEndpoint(opts.Kusto, opts.Region) + if err != nil { + return nil, fmt.Errorf("failed to create Kusto endpoint: %w", err) + } + + if opts.QueryTimeout < 30*time.Second { + return nil, fmt.Errorf("query timeout must be at least 30 seconds") + } + if opts.QueryTimeout > 30*time.Minute { + return nil, fmt.Errorf("query timeout cannot exceed 30 minutes") + } + + if opts.TimestampMin.After(opts.TimestampMax) { + return nil, fmt.Errorf("timestamp-min cannot be after timestamp-max") + } + + return kustoEndpoint, nil +} + +// completeBaseGatherOptions creates the kusto client. +func completeBaseGatherOptions(kustoEndpoint *url.URL, queryTimeout time.Duration, outputPath string) (mustgather.QueryClientInterface, error) { + client, err := kusto.NewClient(kustoEndpoint, queryTimeout) + if err != nil { + return nil, fmt.Errorf("failed to create Kusto client: %w", err) + } + return mustgather.NewQueryClient(client, queryTimeout, outputPath), nil +} + +// createOutputDirectories creates the output directory structure. 
+func createOutputDirectories(outputPath string, skipHCPDir bool) error { + if err := os.MkdirAll(path.Join(outputPath, ServicesLogDirectory), 0755); err != nil { + return fmt.Errorf("failed to create service logs directory: %w", err) + } + if err := os.MkdirAll(path.Join(outputPath, InfraLogDirectory), 0755); err != nil { + return fmt.Errorf("failed to create infrastructure logs directory: %w", err) + } + if !skipHCPDir { + if err := os.MkdirAll(path.Join(outputPath, HostedControlPlaneLogDirectory), 0755); err != nil { + return fmt.Errorf("failed to create customer logs directory: %w", err) + } + } + return nil +} diff --git a/tooling/hcpctl/cmd/must-gather/query_cmd.go b/tooling/hcpctl/cmd/must-gather/query_cmd.go index f42ed75b20..6431a23e14 100644 --- a/tooling/hcpctl/cmd/must-gather/query_cmd.go +++ b/tooling/hcpctl/cmd/must-gather/query_cmd.go @@ -25,7 +25,7 @@ import ( ) func newQueryCommand() (*cobra.Command, error) { - opts := DefaultMustGatherOptions() + opts := DefaultQueryOptions() cmd := &cobra.Command{ Use: "query", @@ -43,14 +43,14 @@ func newQueryCommand() (*cobra.Command, error) { }, } - if err := BindMustGatherOptions(opts, cmd); err != nil { + if err := BindQueryOptions(opts, cmd); err != nil { return nil, err } return cmd, nil } -func (opts *MustGatherOptions) Run(ctx context.Context) error { +func (opts *CompletedQueryOptions) RunQuery(ctx context.Context) error { logger := logr.FromContextOrDiscard(ctx) defer func() { if closeErr := opts.QueryClient.Close(); closeErr != nil { @@ -58,7 +58,7 @@ func (opts *MustGatherOptions) Run(ctx context.Context) error { } }() - queryOptions, err := mustgather.NewQueryOptions(opts.SubscriptionID, opts.ResourceGroup, opts.ResourceId, opts.InfraClusterName, opts.TimestampMin, opts.TimestampMax, opts.Limit) + queryOptions, err := mustgather.NewQueryOptions(opts.SubscriptionID, opts.ResourceGroup, opts.ResourceId, opts.TimestampMin, opts.TimestampMax, opts.Limit) if err != nil { return fmt.Errorf("failed to 
create query options: %w", err) } @@ -66,7 +66,7 @@ func (opts *MustGatherOptions) Run(ctx context.Context) error { gatherer := mustgather.NewCliGatherer(opts.QueryClient, opts.OutputPath, ServicesLogDirectory, HostedControlPlaneLogDirectory, mustgather.GathererOptions{ QueryOptions: queryOptions, SkipHostedControlPlaneLogs: opts.SkipHostedControlPlaneLogs, - GatherInfraLogs: opts.GatherInfraLogs, + GatherInfraLogs: false, }) err = gatherer.GatherLogs(ctx) diff --git a/tooling/hcpctl/cmd/must-gather/query_infra_cmd.go b/tooling/hcpctl/cmd/must-gather/query_infra_cmd.go new file mode 100644 index 0000000000..3f15de1a0f --- /dev/null +++ b/tooling/hcpctl/cmd/must-gather/query_infra_cmd.go @@ -0,0 +1,104 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mustgather + +import ( + "context" + "fmt" + "time" + + "github.com/go-logr/logr" + "github.com/spf13/cobra" + + "github.com/Azure/ARO-HCP/tooling/hcpctl/pkg/mustgather" +) + +func newQueryInfraCommand() (*cobra.Command, error) { + opts := DefaultInfraQueryOptions() + + cmd := &cobra.Command{ + Use: "query-infra", + Short: "Execute infrastructure queries against Azure Data Explorer", + Long: `Execute preconfigured infrastructure queries against Azure Data Explorer clusters. +Gathers kubernetes events, systemd logs, and service logs for infrastructure clusters. + +You can provide multiple --service-cluster and --mgmt-cluster flags. 
+Logs will be collected sequentially and stored in a single output folder.`,
+		Args:             cobra.NoArgs,
+		SilenceUsage:     true,
+		SilenceErrors:    true,
+		TraverseChildren: true,
+		RunE: func(cmd *cobra.Command, args []string) error {
+			return opts.Run(cmd.Context())
+		},
+		CompletionOptions: cobra.CompletionOptions{
+			HiddenDefaultCmd: true,
+		},
+	}
+
+	if err := BindInfraQueryOptions(opts, cmd); err != nil {
+		return nil, err
+	}
+
+	return cmd, nil
+}
+
+func (opts *CompletedInfraQueryOptions) RunInfra(ctx context.Context) error {
+	logger := logr.FromContextOrDiscard(ctx)
+	defer func() {
+		if closeErr := opts.QueryClient.Close(); closeErr != nil {
+			logger.Error(closeErr, "Warning: failed to close Kusto client")
+		}
+	}()
+
+	allErrors := []error{}
+
+	for _, clusterName := range opts.ServiceClusters {
+		if err := runQuery(ctx, logger, opts.QueryClient, opts.OutputPath, clusterName, mustgather.InfraClusterTypeService, opts.TimestampMin, opts.TimestampMax, opts.Limit); err != nil {
+			allErrors = append(allErrors, err)
+		}
+	}
+	for _, clusterName := range opts.MgmtClusters {
+		if err := runQuery(ctx, logger, opts.QueryClient, opts.OutputPath, clusterName, mustgather.InfraClusterTypeManagement, opts.TimestampMin, opts.TimestampMax, opts.Limit); err != nil {
+			allErrors = append(allErrors, err)
+		}
+	}
+
+	if len(allErrors) > 0 {
+		return fmt.Errorf("failed to gather infrastructure logs for some clusters: %v", allErrors)
+	}
+
+	return nil
+}
+
+func runQuery(ctx context.Context, logger logr.Logger, queryClient mustgather.QueryClientInterface, outputPath string, clusterName string, clusterType mustgather.InfraClusterType, timestampMin time.Time, timestampMax time.Time, limit int) error {
+	logger.V(1).Info("Gathering infrastructure logs", "cluster", clusterName)
+
+	queryOptions, err := mustgather.NewInfraQueryOptions(clusterType, clusterName, timestampMin, timestampMax, limit)
+	if err != nil {
+		return fmt.Errorf("failed to create query options for cluster %s: %w",
clusterName, err) + } + + gatherer := mustgather.NewCliGatherer(queryClient, outputPath, ServicesLogDirectory, HostedControlPlaneLogDirectory, mustgather.GathererOptions{ + QueryOptions: queryOptions, + GatherInfraLogs: true, + }) + + if err := gatherer.GatherLogs(ctx); err != nil { + return fmt.Errorf("failed to gather infrastructure logs for cluster %s: %w", clusterName, err) + } + + return nil +} diff --git a/tooling/hcpctl/cmd/must-gather/query_infra_options.go b/tooling/hcpctl/cmd/must-gather/query_infra_options.go new file mode 100644 index 0000000000..4f0554ea05 --- /dev/null +++ b/tooling/hcpctl/cmd/must-gather/query_infra_options.go @@ -0,0 +1,112 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mustgather + +import ( + "context" + "fmt" + "net/url" + + "github.com/spf13/cobra" + + "github.com/Azure/ARO-HCP/tooling/hcpctl/pkg/mustgather" +) + +// RawInfraQueryOptions represents the initial, unvalidated configuration for infrastructure query operations. +type RawInfraQueryOptions struct { + BaseGatherOptions + ServiceClusters []string // Service cluster names + MgmtClusters []string // Management cluster names +} + +// DefaultInfraQueryOptions returns a new RawInfraQueryOptions struct initialized with sensible defaults. 
+func DefaultInfraQueryOptions() *RawInfraQueryOptions { + return &RawInfraQueryOptions{ + BaseGatherOptions: DefaultBaseGatherOptions(), + } +} + +// BindInfraQueryOptions configures cobra command flags for infrastructure query options. +func BindInfraQueryOptions(opts *RawInfraQueryOptions, cmd *cobra.Command) error { + if err := BindBaseGatherOptions(&opts.BaseGatherOptions, cmd); err != nil { + return err + } + + cmd.Flags().StringArrayVar(&opts.ServiceClusters, "service-cluster", opts.ServiceClusters, "service cluster name (can be specified multiple times)") + cmd.Flags().StringArrayVar(&opts.MgmtClusters, "mgmt-cluster", opts.MgmtClusters, "management cluster name (can be specified multiple times)") + + return nil +} + +// ValidatedInfraQueryOptions represents infrastructure query configuration that has passed validation. +type ValidatedInfraQueryOptions struct { + *RawInfraQueryOptions + + KustoEndpoint *url.URL +} + +// Validate performs validation of all infrastructure query input parameters. +func (o *RawInfraQueryOptions) Validate(ctx context.Context) (*ValidatedInfraQueryOptions, error) { + kustoEndpoint, err := validateBaseGatherOptions(&o.BaseGatherOptions) + if err != nil { + return nil, err + } + + if len(o.ServiceClusters) == 0 && len(o.MgmtClusters) == 0 { + return nil, fmt.Errorf("at least one --service-cluster or --mgmt-cluster is required") + } + + return &ValidatedInfraQueryOptions{ + RawInfraQueryOptions: o, + KustoEndpoint: kustoEndpoint, + }, nil +} + +// CompletedInfraQueryOptions represents the final, fully validated and initialized configuration for infrastructure query operations. +type CompletedInfraQueryOptions struct { + *ValidatedInfraQueryOptions + QueryClient mustgather.QueryClientInterface +} + +// Complete performs final initialization to create fully usable CompletedInfraQueryOptions. 
+func (o *ValidatedInfraQueryOptions) Complete(ctx context.Context) (*CompletedInfraQueryOptions, error) { + queryClient, err := completeBaseGatherOptions(o.KustoEndpoint, o.QueryTimeout, o.OutputPath) + if err != nil { + return nil, err + } + + if err := createOutputDirectories(o.OutputPath, true); err != nil { + return nil, err + } + + return &CompletedInfraQueryOptions{ + ValidatedInfraQueryOptions: o, + QueryClient: queryClient, + }, nil +} + +func (opts *RawInfraQueryOptions) Run(ctx context.Context) error { + validated, err := opts.Validate(ctx) + if err != nil { + return err + } + + completed, err := validated.Complete(ctx) + if err != nil { + return err + } + + return completed.RunInfra(ctx) +} diff --git a/tooling/hcpctl/cmd/must-gather/query_options.go b/tooling/hcpctl/cmd/must-gather/query_options.go index a880190cab..3e67970cfc 100644 --- a/tooling/hcpctl/cmd/must-gather/query_options.go +++ b/tooling/hcpctl/cmd/must-gather/query_options.go @@ -18,87 +18,39 @@ import ( "context" "fmt" "net/url" - "os" - "path" - "time" "github.com/go-logr/logr" "github.com/spf13/cobra" - "github.com/Azure/ARO-HCP/tooling/hcpctl/pkg/kusto" "github.com/Azure/ARO-HCP/tooling/hcpctl/pkg/mustgather" ) -// RawMustGatherOptions represents the initial, unvalidated configuration for must-gather operations. 
-type RawMustGatherOptions struct { - Kusto string // Name of the Azure Data Explorer cluster - Region string // Region of the Azure Data Explorer cluster - OutputPath string // Path to write the output file - QueryTimeout time.Duration // Timeout for query execution - SubscriptionID string // Subscription ID - ResourceGroup string // Resource group - ResourceId string // Resource ID - SkipHostedControlPlaneLogs bool // Skip hosted control plane logs - TimestampMin time.Time // Timestamp minimum - TimestampMax time.Time // Timestamp maximum - Limit int // Limit the number of results - GatherInfraLogs bool // Gather all logs from the infrastructure, does NOT gather HCP logs - InfraClusterName string // Name of the infrastructure cluster +// RawQueryOptions represents the initial, unvalidated configuration for query operations. +type RawQueryOptions struct { + BaseGatherOptions + SubscriptionID string // Subscription ID + ResourceGroup string // Resource group + ResourceId string // Resource ID + SkipHostedControlPlaneLogs bool // Skip hosted control plane logs } -// DefaultMustGatherOptions returns a new RawMustGatherOptions struct initialized with sensible defaults. -func DefaultMustGatherOptions() *RawMustGatherOptions { - return &RawMustGatherOptions{ - QueryTimeout: 5 * time.Minute, - TimestampMin: time.Now().Add(-24 * time.Hour), - TimestampMax: time.Now(), - Limit: -1, // defaults to no limit - OutputPath: fmt.Sprintf("must-gather-%s", time.Now().Format("20060102-150405")), +// DefaultQueryOptions returns a new RawQueryOptions struct initialized with sensible defaults. +func DefaultQueryOptions() *RawQueryOptions { + return &RawQueryOptions{ + BaseGatherOptions: DefaultBaseGatherOptions(), } } -func (opts *RawMustGatherOptions) Run(ctx context.Context, runLegacy bool) error { - validated, err := opts.Validate(ctx) - if err != nil { +// BindQueryOptions configures cobra command flags for query specific options. 
+func BindQueryOptions(opts *RawQueryOptions, cmd *cobra.Command) error { + if err := BindBaseGatherOptions(&opts.BaseGatherOptions, cmd); err != nil { return err } - completed, err := validated.Complete(ctx) - if err != nil { - return err - } - - if runLegacy { - return completed.RunLegacy(ctx) - } - - return completed.Run(ctx) -} - -// BindMustGatherOptions configures cobra command flags for must-gather specific options. -func BindMustGatherOptions(opts *RawMustGatherOptions, cmd *cobra.Command) error { - // Add must-gather specific flags - cmd.Flags().StringVar(&opts.Kusto, "kusto", opts.Kusto, "Azure Data Explorer cluster name (required)") - cmd.Flags().StringVar(&opts.Region, "region", opts.Region, "Azure Data Explorer cluster region (required)") - cmd.Flags().DurationVar(&opts.QueryTimeout, "query-timeout", opts.QueryTimeout, "timeout for query execution") - cmd.Flags().StringVar(&opts.OutputPath, "output-path", opts.OutputPath, "path to write the output file") - cmd.Flags().BoolVar(&opts.GatherInfraLogs, "gather-infra-logs", false, "gather all logs from the infrastructure, does NOT gather HCP logs") - cmd.Flags().StringVar(&opts.InfraClusterName, "infra-cluster-name", opts.InfraClusterName, "name of the infrastructure cluster") cmd.Flags().StringVar(&opts.SubscriptionID, "subscription-id", opts.SubscriptionID, "subscription ID") cmd.Flags().StringVar(&opts.ResourceGroup, "resource-group", opts.ResourceGroup, "resource group") cmd.Flags().StringVar(&opts.ResourceId, "resource-id", opts.ResourceId, "resource ID") cmd.Flags().BoolVar(&opts.SkipHostedControlPlaneLogs, "skip-hcp-logs", opts.SkipHostedControlPlaneLogs, "Do not gather customer (ocm namespaces) logs") - cmd.Flags().TimeVar(&opts.TimestampMin, "timestamp-min", opts.TimestampMin, []string{time.DateTime}, "timestamp minimum") - cmd.Flags().TimeVar(&opts.TimestampMax, "timestamp-max", opts.TimestampMax, []string{time.DateTime}, "timestamp maximum") - cmd.Flags().IntVar(&opts.Limit, "limit", opts.Limit, 
"limit the number of results") - - // Mark required flags - requiredFlags := []string{"kusto", "region"} - for _, flag := range requiredFlags { - if err := cmd.MarkFlagRequired(flag); err != nil { - return fmt.Errorf("failed to mark %s as required: %w", flag, err) - } - } cmd.MarkFlagsMutuallyExclusive("subscription-id", "resource-id") cmd.MarkFlagsMutuallyExclusive("resource-group", "resource-id") @@ -108,63 +60,36 @@ func BindMustGatherOptions(opts *RawMustGatherOptions, cmd *cobra.Command) error return nil } -// ValidatedMustGatherOptions represents must-gather configuration that has passed validation. -type ValidatedMustGatherOptions struct { - *RawMustGatherOptions +// ValidatedQueryOptions represents query configuration that has passed validation. +type ValidatedQueryOptions struct { + *RawQueryOptions KustoEndpoint *url.URL QueryOptions mustgather.QueryOptions } -// Validate performs comprehensive validation of all must-gather input parameters. -func (o *RawMustGatherOptions) Validate(ctx context.Context) (*ValidatedMustGatherOptions, error) { +// Validate performs comprehensive validation of all query input parameters. 
+func (o *RawQueryOptions) Validate(ctx context.Context) (*ValidatedQueryOptions, error) { logger := logr.FromContextOrDiscard(ctx) - // Validate kusto name - if o.Kusto == "" { - return nil, fmt.Errorf("kusto is required") - } - // Validate region - if o.Region == "" { - return nil, fmt.Errorf("region is required") - } - - // form kusto URL - kustoEndpoint, err := kusto.KustoEndpoint(o.Kusto, o.Region) + kustoEndpoint, err := validateBaseGatherOptions(&o.BaseGatherOptions) if err != nil { - return nil, fmt.Errorf("failed to create Kusto endpoint: %w", err) + return nil, err } - // Validate query timeout - if o.QueryTimeout < 30*time.Second { - return nil, fmt.Errorf("query timeout must be at least 30 seconds") - } - - if o.QueryTimeout > 30*time.Minute { - return nil, fmt.Errorf("query timeout cannot exceed 30 minutes") - } - - // Validate subscription ID - if o.SubscriptionID == "" && o.ResourceId == "" && o.InfraClusterName == "" { + if o.SubscriptionID == "" && o.ResourceId == "" { return nil, fmt.Errorf("subscription-id is required") } - - // Validate resource group - if o.ResourceGroup == "" && o.ResourceId == "" && o.InfraClusterName == "" { + if o.ResourceGroup == "" && o.ResourceId == "" { return nil, fmt.Errorf("resource-group is required") } - - if o.ResourceId != "" && (o.ResourceGroup != "" || o.SubscriptionID != "" || o.InfraClusterName != "") { - logger.Info("warning: both resource-id and resource-group/subscription-id/infra-cluster-name are provided, will use resource-id to gather cluster ID") - } - - if o.TimestampMin.After(o.TimestampMax) { - return nil, fmt.Errorf("timestamp-min cannot be after timestamp-max") + if o.ResourceId != "" && (o.ResourceGroup != "" || o.SubscriptionID != "") { + logger.Info("warning: both resource-id and resource-group/subscription-id are provided, will use resource-id to gather cluster ID") } - return &ValidatedMustGatherOptions{ - RawMustGatherOptions: o, - KustoEndpoint: kustoEndpoint, + return 
&ValidatedQueryOptions{ + RawQueryOptions: o, + KustoEndpoint: kustoEndpoint, QueryOptions: mustgather.QueryOptions{ SubscriptionId: o.SubscriptionID, ResourceGroupName: o.ResourceGroup, @@ -175,38 +100,43 @@ func (o *RawMustGatherOptions) Validate(ctx context.Context) (*ValidatedMustGath }, nil } -// Complete performs final initialization to create fully usable MustGatherOptions. -func (o *ValidatedMustGatherOptions) Complete(ctx context.Context) (*MustGatherOptions, error) { - client, err := kusto.NewClient(o.KustoEndpoint, o.QueryTimeout) +// CompletedQueryOptions represents the final, fully validated and initialized configuration for query operations. +type CompletedQueryOptions struct { + *ValidatedQueryOptions + QueryClient mustgather.QueryClientInterface +} + +// Complete performs final initialization to create fully usable CompletedQueryOptions. +func (o *ValidatedQueryOptions) Complete(ctx context.Context) (*CompletedQueryOptions, error) { + queryClient, err := completeBaseGatherOptions(o.KustoEndpoint, o.QueryTimeout, o.OutputPath) if err != nil { - return nil, fmt.Errorf("failed to create Kusto client: %w", err) + return nil, err } - err = os.MkdirAll(path.Join(o.OutputPath, ServicesLogDirectory), 0755) - if err != nil { - return nil, fmt.Errorf("failed to create service logs directory: %w", err) + if err := createOutputDirectories(o.OutputPath, o.SkipHostedControlPlaneLogs); err != nil { + return nil, err } - err = os.MkdirAll(path.Join(o.OutputPath, InfraLogDirectory), 0755) + return &CompletedQueryOptions{ + ValidatedQueryOptions: o, + QueryClient: queryClient, + }, nil +} + +func (opts *RawQueryOptions) Run(ctx context.Context, runLegacy bool) error { + validated, err := opts.Validate(ctx) if err != nil { - return nil, fmt.Errorf("failed to create infrastructure logs directory: %w", err) + return err } - if !o.SkipHostedControlPlaneLogs { - err = os.MkdirAll(path.Join(o.OutputPath, HostedControlPlaneLogDirectory), 0755) - if err != nil { - return 
nil, fmt.Errorf("failed to create customer logs directory: %w", err) - } + completed, err := validated.Complete(ctx) + if err != nil { + return err } - return &MustGatherOptions{ - ValidatedMustGatherOptions: o, - QueryClient: mustgather.NewQueryClient(client, o.QueryTimeout, o.OutputPath), - }, nil -} + if runLegacy { + return completed.RunLegacy(ctx) + } -// MustGatherOptions represents the final, fully validated and initialized configuration for must-gather operations. -type MustGatherOptions struct { - *ValidatedMustGatherOptions - QueryClient mustgather.QueryClientInterface + return completed.RunQuery(ctx) } diff --git a/tooling/hcpctl/pkg/mustgather/queries.go b/tooling/hcpctl/pkg/mustgather/queries.go index 6a731a1383..c5b6e6df38 100644 --- a/tooling/hcpctl/pkg/mustgather/queries.go +++ b/tooling/hcpctl/pkg/mustgather/queries.go @@ -33,6 +33,11 @@ const ( QueryTypeSystemdLogs QueryType = "systemd-logs" ) +type InfraClusterType string + +const InfraClusterTypeService InfraClusterType = "service" +const InfraClusterTypeManagement InfraClusterType = "management" + var servicesDatabase = "ServiceLogs" var hostedControlPlaneLogsDatabase = "HostedControlPlaneLogs" @@ -56,12 +61,23 @@ type QueryOptions struct { SubscriptionId string ResourceGroupName string InfraClusterName string + InfraClusterType InfraClusterType TimestampMin time.Time TimestampMax time.Time Limit int } -func NewQueryOptions(subscriptionID, resourceGroupName, resourceId, infraClusterName string, timestampMin, timestampMax time.Time, limit int) (*QueryOptions, error) { +func NewInfraQueryOptions(infraClusterType InfraClusterType, infraClusterName string, timestampMin, timestampMax time.Time, limit int) (*QueryOptions, error) { + return &QueryOptions{ + InfraClusterName: infraClusterName, + InfraClusterType: infraClusterType, + TimestampMin: timestampMin, + TimestampMax: timestampMax, + Limit: limit, + }, nil +} + +func NewQueryOptions(subscriptionID, resourceGroupName, resourceId string, 
timestampMin, timestampMax time.Time, limit int) (*QueryOptions, error) { if resourceId != "" { res, err := azcorearm.ParseResourceID(resourceId) if err != nil { @@ -77,7 +93,6 @@ func NewQueryOptions(subscriptionID, resourceGroupName, resourceId, infraCluster TimestampMin: timestampMin, TimestampMax: timestampMax, Limit: limit, - InfraClusterName: infraClusterName, }, nil } diff --git a/tooling/hcpctl/pkg/mustgather/queries_test.go b/tooling/hcpctl/pkg/mustgather/queries_test.go index 02fa7c1c40..620c4d7671 100644 --- a/tooling/hcpctl/pkg/mustgather/queries_test.go +++ b/tooling/hcpctl/pkg/mustgather/queries_test.go @@ -26,19 +26,19 @@ func TestNewQueryOptions(t *testing.T) { now := time.Now() // With resource ID - opts, err := NewQueryOptions("", "", "/subscriptions/test-sub/resourceGroups/test-rg", "", now, now, 100) + opts, err := NewQueryOptions("", "", "/subscriptions/test-sub/resourceGroups/test-rg", now, now, 100) require.NoError(t, err) assert.Equal(t, "test-sub", opts.SubscriptionId) assert.Equal(t, "test-rg", opts.ResourceGroupName) // With subscription/resource group - opts, err = NewQueryOptions("sub", "rg", "", "", now, now, 100) + opts, err = NewQueryOptions("sub", "rg", "", now, now, 100) require.NoError(t, err) assert.Equal(t, "sub", opts.SubscriptionId) assert.Equal(t, "rg", opts.ResourceGroupName) // Invalid resource ID - _, err = NewQueryOptions("", "", "/invalid", "", now, now, 100) + _, err = NewQueryOptions("", "", "/invalid", now, now, 100) assert.Error(t, err) } From d0eeb47de9bc1e28370982dc1f2b5ba593f3322f Mon Sep 17 00:00:00 2001 From: Jan-Hendrik Boll Date: Wed, 25 Feb 2026 13:22:58 +0000 Subject: [PATCH 4/4] Collect events and systemd in regular query as well --- docs/sops/gather-logs.md | 94 ++----- test/util/verifiers/kusto.go | 3 +- tooling/hcpctl/README.md | 15 +- .../cmd/must-gather/query_base_options.go | 8 +- tooling/hcpctl/cmd/must-gather/query_cmd.go | 2 + .../hcpctl/cmd/must-gather/query_infra_cmd.go | 48 ++-- 
.../cmd/must-gather/query_infra_options.go | 12 +- .../hcpctl/cmd/must-gather/query_options.go | 6 +- tooling/hcpctl/pkg/kusto/client.go | 6 +- tooling/hcpctl/pkg/kusto/queries.go | 16 ++ tooling/hcpctl/pkg/mustgather/gather.go | 98 ++++++-- tooling/hcpctl/pkg/mustgather/gather_test.go | 233 ++++++++++++++++-- tooling/hcpctl/pkg/mustgather/queries.go | 20 +- 13 files changed, 387 insertions(+), 174 deletions(-) diff --git a/docs/sops/gather-logs.md b/docs/sops/gather-logs.md index ddaa4a20f5..7e70c5dbfa 100644 --- a/docs/sops/gather-logs.md +++ b/docs/sops/gather-logs.md @@ -10,54 +10,17 @@ The must-gather commands are designed to collect and process diagnostic data fro ### 0. query -The `query` command is supported in the Kusto instances owned by SLSRE, currently this can be used with dev and int clusters. Prod is work in progress. The difference is simply to use `must-gather query` instead of `must-gather legacy-query`, all the rest works the same. +The `query` command is supported in the Kusto instances owned by SLSRE. See this link for an up-to-date list of clusters and URLs: [hcp/components-and-architecture/kusto](https://eng.ms/docs/cloud-ai-platform/azure-core/azure-cloud-native-and-management-platform/control-plane-bburns/azure-red-hat-openshift/azure-redhat-openshift-team-doc/hcp/components-and-architecture/kusto) -### 1. legacy-query +### 1. query -The `legacy-query` command executes preconfigured queries against Azure Data Explorer clusters using the `akskubesystem` table. This is legacy, cause it uses the ARO Classic table schema and is planned to replace with HCP specific schema/cli in the future. - -*Important:*, when you want to gather data for integrated dev, use the `must-gather query` command instead. 
- -#### Purpose -- Execute default queries against Azure Data Explorer (Kusto) -- Collect service logs for ARO-HCP services and hosted control planes -- Generate structured output for analysis - -#### Required Parameters -- `--kusto`: Azure Data Explorer cluster name, [database list](https://eng.ms/docs/cloud-ai-platform/azure-core/azure-cloud-native-and-management-platform/control-plane-bburns/azure-red-hat-openshift/azure-redhat-openshift-team-doc/doc/monitoring/kusto/kusto-database-list) -- `--region`: Azure Data Explorer cluster region -- `--subscription-id`: Azure subscription ID -- `--resource-group`: Azure resource group name - -#### Optional Parameters -- `--output-path`: Path to write output files (default: auto-generated timestamp-based directory) -- `--query-timeout`: Query execution timeout (default: 5 minutes, range: 30 seconds to 30 minutes) -- `--skip-hcp-logs`: Skip hosted control plane logs collection -- `--timestamp-min`: Minimum timestamp for data collection (default: 24 hours ago) -- `--timestamp-max`: Maximum timestamp for data collection (default: current time) -- `--limit`: Limit number of results returned - - -#### Authentication Requirements - -The commands use standard Azure authentication. Users must authenticate using the Azure CLI before running the commands: - -```bash -# Authenticate with Azure -az login - -# Verify authentication -az account show - -# Set the correct subscription if needed -az account set --subscription "your-subscription-id" -``` +Use query to fetch data for a specific HCP. 
#### Usage Examples **Basic usage with required parameters:** ```bash -hcpctl must-gather legacy-query \ +hcpctl must-gather query \ --kusto my-kusto-cluster \ --region eastus \ --subscription-id 12345678-1234-1234-1234-123456789012 \ @@ -66,7 +29,7 @@ hcpctl must-gather legacy-query \ **With custom output path and time range:** ```bash -hcpctl must-gather legacy-query \ +hcpctl must-gather query \ --kusto my-kusto-cluster \ --region eastus \ --subscription-id 12345678-1234-1234-1234-123456789012 \ @@ -78,7 +41,7 @@ hcpctl must-gather legacy-query \ **Skip hosted control plane logs:** ```bash -hcpctl must-gather legacy-query \ +hcpctl must-gather query \ --kusto my-kusto-cluster \ --region eastus \ --subscription-id 12345678-1234-1234-1234-123456789012 \ @@ -88,7 +51,7 @@ hcpctl must-gather legacy-query \ **With custom timeout and result limit:** ```bash -hcpctl must-gather legacy-query \ +hcpctl must-gather query \ --kusto my-kusto-cluster \ --region eastus \ --subscription-id 12345678-1234-1234-1234-123456789012 \ @@ -97,19 +60,6 @@ hcpctl must-gather legacy-query \ --limit 1000 ``` -#### Output Structure -The command creates the following directory structure: -``` -/ -├── service/ # Service logs directory -│ ├── containerLogs.json -│ ├── frontendContainerLogs.json -│ └── backendContainerLogs.json -├── host-control-plane/ # Hosted control plane logs (if not skipped) -│ └── customerLogs.json -└── options.json # Query options used -``` - #### Handling large data Kusto has limits for what a query can return, in order to overcome these, you can check the `json` files created. These contain information on the datasize queried. You can then use the `limit` and `timestamp` parameters to reduce the number of log rows gathered. These filters are applied per query. 
@@ -126,15 +76,6 @@ The `clean` command processes must-gather data to remove sensitive information u The `must-gather-clean` binary is available from the [openshift/must-gather-clean releases](https://github.com/openshift/must-gather-clean/releases) page. -#### Required Parameters -- `--path-to-clean`: Path to the must-gather data to clean -- `--service-config-path`: Path to ARO-HCP Service Configuration file (points to `config` directory containing `config.yaml`) -- `--must-gather-clean-binary`: Path to the must-gather-clean binary -- `--cleaned-output-path`: Path where cleaned output will be written - -#### Optional Parameters -- `--clean-config-path`: Path to custom must-gather-clean configuration file - #### Usage Examples **Basic usage with required parameters:** @@ -156,17 +97,16 @@ hcpctl must-gather clean \ --clean-config-path ./custom-clean-config.json ``` -#### Default Clean Configuration -When no custom configuration is provided, the default config can be found here [default_config.json](https://github.com/Azure/ARO-HCP/blob/main/tooling/hcpctl/cmd/must-gather/default_config.json) - - -#### Process Flow -1. **Configuration Loading**: Loads default or custom must-gather-clean configuration -2. **Pattern Discovery**: Scans service configuration files for UUIDs and other sensitive patterns -3. **Configuration Extension**: Adds discovered patterns to the clean configuration -4. **Configuration Persistence**: Saves the final configuration to a temporary file -5. **Clean Execution**: Runs the must-gather-clean binary with the generated configuration -6. **Output Generation**: Creates cleaned output in the specified directory +### 3. query-infra +This command fetches all service logs for a given cluster. This can produce quite a lot of data and usually you should use the above `query` command instead. 
+#### Usage Examples +``` +hcpctl must-gather query-infra \ + --kusto hcp-dev-us-2 \ + --region eastus2 \ + --infra-cluster prow-j1231233-mgmt-1 \ + --infra-cluster prow-j3453453-svc +``` diff --git a/test/util/verifiers/kusto.go b/test/util/verifiers/kusto.go index bc8a79ca7b..62e6b5fe1d 100644 --- a/test/util/verifiers/kusto.go +++ b/test/util/verifiers/kusto.go @@ -78,7 +78,6 @@ func (v verifyMustGatherLogsImpl) Verify(ctx context.Context) error { v.config.SubscriptionID, v.config.ResourceGroup, "", // resourceId - "", // infraClusterName time.Now().Add(-24*time.Hour), // timestampMin time.Now(), // timestampMax -1, // limit: -1 means no truncation @@ -107,6 +106,8 @@ func (v verifyMustGatherLogsImpl) Verify(ctx context.Context) error { mustgather.RowOutputOptions{}, mustgather.GathererOptions{ SkipHostedControlPlaneLogs: false, + SkipKubernetesEventsLogs: true, + SkipSystemdLogs: true, QueryOptions: queryOptions, }, ) diff --git a/tooling/hcpctl/README.md b/tooling/hcpctl/README.md index 54419fefa3..37bb5df127 100644 --- a/tooling/hcpctl/README.md +++ b/tooling/hcpctl/README.md @@ -122,7 +122,13 @@ hcpctl hcp breakglass 12345678-1234-1234-1234-123456789abc --privileged ### Gather Managed cluster logs -This is the usual use case for must-gather kust. You can gather logs for a managed cluster from Kusto. You need to be logged into Azure to access Kusto. You need to set kusto and region to point to the Kusto instance containing the desired logs. +This is the usual use case for must-gather kusto. You can gather logs for a managed cluster from Kusto. You need to be logged into Azure to access Kusto. You need to set kusto and region to point to the Kusto instance containing the desired logs. + +What is gathered? 
+ +- All service logs that contain the subscription ID and resource group name, or are in the cluster namespace (aka HCP logs) +- All Kubernetes events from the management and service cluster +- All Systemd logs from the management and service cluster ```bash hcpctl must-gather query --kusto $kusto --region $region --subscription-id $subscription_id --resource-group $resource_group @@ -148,13 +154,12 @@ If you want to gather all Kusto logs for a given infra cluster (servicecluster o ```bash hcpctl must-gather query-infra \ --kusto aroint --region eastus \ - --service-cluster $svc_cluster_name \ - --mgmt-cluster $mgmt_cluster_name \ + --infra-cluster $svc_cluster_name \ + --infra-cluster $mgmt_cluster_name \ --limit 10000 - ``` -You can provide multiple `service-cluster` parameters and multiple `mgmt-cluster`. Logs will be collected sequentially and stored in a single folder for all clusters provided. +You can provide multiple `infra-cluster` parameters. Logs will be collected sequentially and stored in a single folder for all clusters provided. ## TODO diff --git a/tooling/hcpctl/cmd/must-gather/query_base_options.go b/tooling/hcpctl/cmd/must-gather/query_base_options.go index 7765afb6e9..399de1c332 100644 --- a/tooling/hcpctl/cmd/must-gather/query_base_options.go +++ b/tooling/hcpctl/cmd/must-gather/query_base_options.go @@ -107,12 +107,14 @@ func completeBaseGatherOptions(kustoEndpoint *url.URL, queryTimeout time.Duratio } // createOutputDirectories creates the output directory structure. 
-func createOutputDirectories(outputPath string, skipHCPDir bool) error { +func createOutputDirectories(outputPath string, skipHCPDir bool, skipKubernetesEventsDir bool, skipSystemdLogsDir bool) error { if err := os.MkdirAll(path.Join(outputPath, ServicesLogDirectory), 0755); err != nil { return fmt.Errorf("failed to create service logs directory: %w", err) } - if err := os.MkdirAll(path.Join(outputPath, InfraLogDirectory), 0755); err != nil { - return fmt.Errorf("failed to create infrastructure logs directory: %w", err) + if !skipKubernetesEventsDir || !skipSystemdLogsDir { + if err := os.MkdirAll(path.Join(outputPath, InfraLogDirectory), 0755); err != nil { + return fmt.Errorf("failed to create infrastructure logs directory: %w", err) + } } if !skipHCPDir { if err := os.MkdirAll(path.Join(outputPath, HostedControlPlaneLogDirectory), 0755); err != nil { diff --git a/tooling/hcpctl/cmd/must-gather/query_cmd.go b/tooling/hcpctl/cmd/must-gather/query_cmd.go index 6431a23e14..e522b02a0b 100644 --- a/tooling/hcpctl/cmd/must-gather/query_cmd.go +++ b/tooling/hcpctl/cmd/must-gather/query_cmd.go @@ -66,6 +66,8 @@ func (opts *CompletedQueryOptions) RunQuery(ctx context.Context) error { gatherer := mustgather.NewCliGatherer(opts.QueryClient, opts.OutputPath, ServicesLogDirectory, HostedControlPlaneLogDirectory, mustgather.GathererOptions{ QueryOptions: queryOptions, SkipHostedControlPlaneLogs: opts.SkipHostedControlPlaneLogs, + SkipKubernetesEventsLogs: opts.SkipKubernetesEventsLogs, + SkipSystemdLogs: opts.SkipSystemdLogs, GatherInfraLogs: false, }) diff --git a/tooling/hcpctl/cmd/must-gather/query_infra_cmd.go b/tooling/hcpctl/cmd/must-gather/query_infra_cmd.go index 3f15de1a0f..97c05db48f 100644 --- a/tooling/hcpctl/cmd/must-gather/query_infra_cmd.go +++ b/tooling/hcpctl/cmd/must-gather/query_infra_cmd.go @@ -16,8 +16,8 @@ package mustgather import ( "context" + "errors" "fmt" - "time" "github.com/go-logr/logr" "github.com/spf13/cobra" @@ -34,7 +34,7 @@ func 
newQueryInfraCommand() (*cobra.Command, error) { Long: `Execute preconfigured infrastructure queries against Azure Data Explorer clusters. Gathers kubernetes events, systemd logs, and service logs for infrastructure clusters. -You can provide multiple --service-cluster and --mgmt-cluster flags. +You can provide multiple --infra-cluster flags. Logs will be collected sequentially and stored in a single output folder.`, Args: cobra.NoArgs, SilenceUsage: true, @@ -65,39 +65,27 @@ func (opts *CompletedInfraQueryOptions) RunInfra(ctx context.Context) error { allErrors := []error{} - for _, clusterName := range opts.ServiceClusters { - if err := runQuery(ctx, logger, opts.QueryClient, opts.OutputPath, clusterName, mustgather.InfraClusterTypeService, opts.TimestampMin, opts.TimestampMax, opts.Limit); err != nil { - allErrors = append(allErrors, err) - } - } - for _, clusterName := range opts.MgmtClusters { - if err := runQuery(ctx, logger, opts.QueryClient, opts.OutputPath, clusterName, mustgather.InfraClusterTypeManagement, opts.TimestampMin, opts.TimestampMax, opts.Limit); err != nil { - allErrors = append(allErrors, err) - } - } - - if len(allErrors) > 0 { - return fmt.Errorf("failed to gather infrastructure logs for some clusters: %w", allErrors) - } + for _, clusterName := range opts.InfraClusters { + logger.V(1).Info("Gathering infrastructure logs", "cluster", clusterName) - return nil -} + queryOptions, err := mustgather.NewInfraQueryOptions(clusterName, opts.TimestampMin, opts.TimestampMax, opts.Limit) + if err != nil { + allErrors = append(allErrors, fmt.Errorf("failed to create query options for cluster %s: %w", clusterName, err)) + continue + } -func runQuery(ctx context.Context, logger logr.Logger, queryClient mustgather.QueryClientInterface, outputPath string, clusterName string, clusterType mustgather.InfraClusterType, timestampMin time.Time, timestampMax time.Time, limit int) error { - logger.V(1).Info("Gathering infrastructure logs", "cluster", clusterName) 
+ gatherer := mustgather.NewCliGatherer(opts.QueryClient, opts.OutputPath, ServicesLogDirectory, HostedControlPlaneLogDirectory, mustgather.GathererOptions{ + QueryOptions: queryOptions, + GatherInfraLogs: true, + }) - queryOptions, err := mustgather.NewInfraQueryOptions(clusterType, clusterName, timestampMin, timestampMax, limit) - if err != nil { - return fmt.Errorf("failed to create query options for cluster %s: %w", clusterName, err) + if err := gatherer.GatherLogs(ctx); err != nil { + allErrors = append(allErrors, fmt.Errorf("failed to gather infrastructure logs for cluster %s: %w", clusterName, err)) + } } - gatherer := mustgather.NewCliGatherer(queryClient, outputPath, ServicesLogDirectory, HostedControlPlaneLogDirectory, mustgather.GathererOptions{ - QueryOptions: queryOptions, - GatherInfraLogs: true, - }) - - if err := gatherer.GatherLogs(ctx); err != nil { - return fmt.Errorf("failed to gather infrastructure logs for cluster %s: %w", clusterName, err) + if len(allErrors) > 0 { + return fmt.Errorf("failed to gather infrastructure logs for some clusters: %w", errors.Join(allErrors...)) } return nil diff --git a/tooling/hcpctl/cmd/must-gather/query_infra_options.go b/tooling/hcpctl/cmd/must-gather/query_infra_options.go index 4f0554ea05..6bdbd72aab 100644 --- a/tooling/hcpctl/cmd/must-gather/query_infra_options.go +++ b/tooling/hcpctl/cmd/must-gather/query_infra_options.go @@ -27,8 +27,7 @@ import ( // RawInfraQueryOptions represents the initial, unvalidated configuration for infrastructure query operations. type RawInfraQueryOptions struct { BaseGatherOptions - ServiceClusters []string // Service cluster names - MgmtClusters []string // Management cluster names + InfraClusters []string // Infrastructure cluster names } // DefaultInfraQueryOptions returns a new RawInfraQueryOptions struct initialized with sensible defaults. 
@@ -44,8 +43,7 @@ func BindInfraQueryOptions(opts *RawInfraQueryOptions, cmd *cobra.Command) error return err } - cmd.Flags().StringArrayVar(&opts.ServiceClusters, "service-cluster", opts.ServiceClusters, "service cluster name (can be specified multiple times)") - cmd.Flags().StringArrayVar(&opts.MgmtClusters, "mgmt-cluster", opts.MgmtClusters, "management cluster name (can be specified multiple times)") + cmd.Flags().StringArrayVar(&opts.InfraClusters, "infra-cluster", opts.InfraClusters, "infrastructure cluster name (can be specified multiple times)") return nil } @@ -64,8 +62,8 @@ func (o *RawInfraQueryOptions) Validate(ctx context.Context) (*ValidatedInfraQue return nil, err } - if len(o.ServiceClusters) == 0 && len(o.MgmtClusters) == 0 { - return nil, fmt.Errorf("at least one --service-cluster or --mgmt-cluster is required") + if len(o.InfraClusters) == 0 { + return nil, fmt.Errorf("at least one --infra-cluster is required") } return &ValidatedInfraQueryOptions{ @@ -87,7 +85,7 @@ func (o *ValidatedInfraQueryOptions) Complete(ctx context.Context) (*CompletedIn return nil, err } - if err := createOutputDirectories(o.OutputPath, true); err != nil { + if err := createOutputDirectories(o.OutputPath, true, false, false); err != nil { return nil, err } diff --git a/tooling/hcpctl/cmd/must-gather/query_options.go b/tooling/hcpctl/cmd/must-gather/query_options.go index 3e67970cfc..a8a27e4e8d 100644 --- a/tooling/hcpctl/cmd/must-gather/query_options.go +++ b/tooling/hcpctl/cmd/must-gather/query_options.go @@ -32,6 +32,8 @@ type RawQueryOptions struct { ResourceGroup string // Resource group ResourceId string // Resource ID SkipHostedControlPlaneLogs bool // Skip hosted control plane logs + SkipKubernetesEventsLogs bool // Skip Kubernetes events logs + SkipSystemdLogs bool // Skip Systemd logs } // DefaultQueryOptions returns a new RawQueryOptions struct initialized with sensible defaults. 
@@ -51,6 +53,8 @@ func BindQueryOptions(opts *RawQueryOptions, cmd *cobra.Command) error { cmd.Flags().StringVar(&opts.ResourceGroup, "resource-group", opts.ResourceGroup, "resource group") cmd.Flags().StringVar(&opts.ResourceId, "resource-id", opts.ResourceId, "resource ID") cmd.Flags().BoolVar(&opts.SkipHostedControlPlaneLogs, "skip-hcp-logs", opts.SkipHostedControlPlaneLogs, "Do not gather customer (ocm namespaces) logs") + cmd.Flags().BoolVar(&opts.SkipKubernetesEventsLogs, "skip-kubernetes-events-logs", opts.SkipKubernetesEventsLogs, "Do not gather Kubernetes events logs") + cmd.Flags().BoolVar(&opts.SkipSystemdLogs, "skip-systemd-logs", opts.SkipSystemdLogs, "Do not gather Systemd logs") cmd.MarkFlagsMutuallyExclusive("subscription-id", "resource-id") cmd.MarkFlagsMutuallyExclusive("resource-group", "resource-id") @@ -113,7 +117,7 @@ func (o *ValidatedQueryOptions) Complete(ctx context.Context) (*CompletedQueryOp return nil, err } - if err := createOutputDirectories(o.OutputPath, o.SkipHostedControlPlaneLogs); err != nil { + if err := createOutputDirectories(o.OutputPath, o.SkipHostedControlPlaneLogs, o.SkipKubernetesEventsLogs, o.SkipSystemdLogs); err != nil { return nil, err } diff --git a/tooling/hcpctl/pkg/kusto/client.go b/tooling/hcpctl/pkg/kusto/client.go index b64362a5ee..c0fae7f9c3 100644 --- a/tooling/hcpctl/pkg/kusto/client.go +++ b/tooling/hcpctl/pkg/kusto/client.go @@ -121,7 +121,7 @@ func (c *Client) ExecutePreconfiguredQuery(ctx context.Context, query *Configura return nil, fmt.Errorf("primary result is nil") } - columsSet := false + columnsSet := false for row := range primaryResult.Table().Rows() { logger.V(8).Info("Processing row", "rowNumber", totalRows) row := row.Row() @@ -131,9 +131,9 @@ func (c *Client) ExecutePreconfiguredQuery(ctx context.Context, query *Configura } continue } - if !columsSet && row.Columns() != nil { + if !columnsSet && row.Columns() != nil { columns = row.Columns() - columsSet = true + columnsSet = true } select { 
case <-ctx.Done(): diff --git a/tooling/hcpctl/pkg/kusto/queries.go b/tooling/hcpctl/pkg/kusto/queries.go index 6ca13e7a97..bfb61f307f 100644 --- a/tooling/hcpctl/pkg/kusto/queries.go +++ b/tooling/hcpctl/pkg/kusto/queries.go @@ -35,3 +35,19 @@ func NewClusterIdQuery(database, clusterServiceLogsTable, subscriptionId, resour Parameters: parameters, } } + +func NewClusterNamesQuery(database, containerLogsTable, subscriptionId, resourceGroup string) *ConfigurableQuery { + builder := kql.New("").AddTable(containerLogsTable) + builder.AddLiteral("\n| where log has subResourceGroupId") + builder.AddLiteral("\n| distinct cluster") + + parameters := kql.NewParameters() + parameters.AddString("subResourceGroupId", fmt.Sprintf("/subscriptions/%s/resourceGroups/%s", subscriptionId, resourceGroup)) + + return &ConfigurableQuery{ + Name: "Cluster Names", + Database: database, + Query: builder, + Parameters: parameters, + } +} diff --git a/tooling/hcpctl/pkg/mustgather/gather.go b/tooling/hcpctl/pkg/mustgather/gather.go index d63541cf58..cb5269007c 100644 --- a/tooling/hcpctl/pkg/mustgather/gather.go +++ b/tooling/hcpctl/pkg/mustgather/gather.go @@ -86,6 +86,8 @@ type NormalizedLogLine struct { type GathererOptions struct { GatherInfraLogs bool // Gather all logs from the infrastructure, does NOT gather HCP logs SkipHostedControlPlaneLogs bool // Skip hosted control plane logs + SkipKubernetesEventsLogs bool // Skip Kubernetes events logs + SkipSystemdLogs bool // Skip Systemd logs QueryOptions *QueryOptions // Query options } @@ -294,8 +296,6 @@ func cliOutputFunc(ctx context.Context, logLineChan chan *NormalizedLogLine, que } } } - - return allErrors } func (g *Gatherer) GatherLogs(ctx context.Context) error { @@ -305,17 +305,23 @@ func (g *Gatherer) GatherLogs(ctx context.Context) error { return g.gatherInfraLogs(ctx) } + var gatherErrors error + // First, get all cluster IDs - clusterIds, err := g.executeClusterIdQuery(ctx, g.opts.QueryOptions.GetClusterIdQuery()) + 
clusterIds := make([]string, 0) + allClusterIds, err := g.executeQueryAndConvert(ctx, g.opts.QueryOptions.GetClusterIdQuery(), ClusterIdRow{}) if err != nil { return fmt.Errorf("failed to execute cluster id query: %w", err) } + for _, row := range allClusterIds { + clusterIds = append(clusterIds, row.(ClusterIdRow).ClusterId) + } logger.V(1).Info("Obtained following clusterIDs", "clusterIds", strings.Join(clusterIds, ", ")) g.opts.QueryOptions.ClusterIds = clusterIds // Gather service logs if err := g.queryAndWriteToFile(ctx, QueryTypeServices, g.opts.QueryOptions.GetServicesQueries()); err != nil { - return fmt.Errorf("failed to execute services query: %w", err) + gatherErrors = errors.Join(gatherErrors, fmt.Errorf("failed to execute services query: %w", err)) } // Gather hosted control plane logs if not skipped @@ -324,11 +330,53 @@ func (g *Gatherer) GatherLogs(ctx context.Context) error { } else { logger.V(1).Info("Executing hosted control plane logs") if err := g.queryAndWriteToFile(ctx, QueryTypeHostedControlPlane, g.opts.QueryOptions.GetHostedControlPlaneLogsQuery()); err != nil { - return fmt.Errorf("failed to execute hosted control plane logs query: %w", err) + gatherErrors = errors.Join(gatherErrors, fmt.Errorf("failed to execute hosted control plane logs query: %w", err)) } } - return nil + // Gather cluster names + if g.opts.SkipKubernetesEventsLogs && g.opts.SkipSystemdLogs { + logger.V(1).Info("Skipping Kubernetes events and Systemd logs") + return nil + } + + clusterNames := make([]string, 0) + for _, nameQuery := range g.opts.QueryOptions.GetClusterNamesQueries() { + allClusterNames, err := g.executeQueryAndConvert(ctx, nameQuery, ClusterNameRow{}) + if err != nil { + gatherErrors = errors.Join(gatherErrors, fmt.Errorf("failed to execute cluster names query: %w", err)) + } + for _, row := range allClusterNames { + clusterNames = append(clusterNames, row.(ClusterNameRow).ClusterName) + } + } + logger.V(1).Info("Obtained following clusterNames", 
"clusterNames", strings.Join(clusterNames, ", ")) + + if !g.opts.SkipKubernetesEventsLogs { + allKubernetesEventsQueries := make([]*kusto.ConfigurableQuery, 0) + for _, clusterName := range clusterNames { + opts := *g.opts.QueryOptions + opts.InfraClusterName = clusterName + allKubernetesEventsQueries = append(allKubernetesEventsQueries, opts.GetInfraKubernetesEventsQuery()...) + } + if err := g.queryAndWriteToFile(ctx, QueryTypeKubernetesEvents, allKubernetesEventsQueries); err != nil { + gatherErrors = errors.Join(gatherErrors, fmt.Errorf("failed to execute kubernetes events query: %w", err)) + } + } + + if !g.opts.SkipSystemdLogs { + allSystemdLogsQueries := make([]*kusto.ConfigurableQuery, 0) + for _, clusterName := range clusterNames { + opts := *g.opts.QueryOptions + opts.InfraClusterName = clusterName + allSystemdLogsQueries = append(allSystemdLogsQueries, opts.GetInfraSystemdLogsQuery()...) + } + if err := g.queryAndWriteToFile(ctx, QueryTypeSystemdLogs, allSystemdLogsQueries); err != nil { + gatherErrors = errors.Join(gatherErrors, fmt.Errorf("failed to execute systemd logs query: %w", err)) + } + } + + return gatherErrors } func (g *Gatherer) gatherInfraLogs(ctx context.Context) error { @@ -344,35 +392,45 @@ func (g *Gatherer) gatherInfraLogs(ctx context.Context) error { return nil } -func (g *Gatherer) executeClusterIdQuery(ctx context.Context, query *kusto.ConfigurableQuery) ([]string, error) { +func (g *Gatherer) executeQueryAndConvert(ctx context.Context, query *kusto.ConfigurableQuery, targetRow any) ([]any, error) { outputChannel := make(chan azkquery.Row) - allClusterIds := make([]string, 0) + allRows := make([]any, 0) group := new(errgroup.Group) group.Go(func() error { for row := range outputChannel { - cidRow := &ClusterIdRow{} - if err := row.ToStruct(cidRow); err != nil { - return fmt.Errorf("failed to convert row to struct: %w", err) - } - if cidRow.ClusterId != "" { - allClusterIds = append(allClusterIds, cidRow.ClusterId) + switch 
targetRow.(type) { + case ClusterIdRow: + cidRow := targetRow.(ClusterIdRow) + if err := row.ToStruct(&cidRow); err != nil { + return fmt.Errorf("failed to convert row to struct: %w", err) + } + allRows = append(allRows, cidRow) + case ClusterNameRow: + clusterNameRow := targetRow.(ClusterNameRow) + if err := row.ToStruct(&clusterNameRow); err != nil { + return fmt.Errorf("failed to convert row to struct: %w", err) + } + allRows = append(allRows, clusterNameRow) + default: + return fmt.Errorf("unsupported target row type: %T", targetRow) } } return nil }) - _, err := g.QueryClient.ExecutePreconfiguredQuery(ctx, query, outputChannel) - if err != nil { - return nil, fmt.Errorf("failed to execute query: %w", err) - } + _, queryErr := g.QueryClient.ExecutePreconfiguredQuery(ctx, query, outputChannel) close(outputChannel) if err := group.Wait(); err != nil { - return nil, fmt.Errorf("failed to execute query: %w", err) + return nil, fmt.Errorf("failed to process query results: %w", err) + } + + if queryErr != nil { + return nil, fmt.Errorf("failed to execute query: %w", queryErr) } - return allClusterIds, nil + return allRows, nil } func (g *Gatherer) queryAndWriteToFile(ctx context.Context, queryType QueryType, queries []*kusto.ConfigurableQuery) error { diff --git a/tooling/hcpctl/pkg/mustgather/gather_test.go b/tooling/hcpctl/pkg/mustgather/gather_test.go index ae64e3403e..40efadda95 100644 --- a/tooling/hcpctl/pkg/mustgather/gather_test.go +++ b/tooling/hcpctl/pkg/mustgather/gather_test.go @@ -93,6 +93,8 @@ func TestGatherer_GatherLogs(t *testing.T) { gatherer := &Gatherer{ QueryClient: mockQueryClient, opts: GathererOptions{ + SkipKubernetesEventsLogs: true, + SkipSystemdLogs: true, QueryOptions: &QueryOptions{ SubscriptionId: "test-sub", ResourceGroupName: "test-rg", @@ -102,14 +104,14 @@ func TestGatherer_GatherLogs(t *testing.T) { outputOptions: RowOutputOptions{"outputPath": "/test"}, } - // Success case + // Success case: cluster ID query + services + HCP 
queries mockQueryClient.On("ExecutePreconfiguredQuery", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("*kusto.ConfigurableQuery"), mock.Anything).Return(&kusto.QueryResult{}, nil).Once() mockQueryClient.On("ConcurrentQueries", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("[]*kusto.ConfigurableQuery"), mock.Anything).Return(nil).Twice() err := gatherer.GatherLogs(t.Context()) assert.NoError(t, err) - // Error case + // Error case: cluster ID query fails mockQueryClient.On("ExecutePreconfiguredQuery", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("*kusto.ConfigurableQuery"), mock.Anything).Return(nil, errors.New("query failed")) err = gatherer.GatherLogs(t.Context()) @@ -118,6 +120,138 @@ func TestGatherer_GatherLogs(t *testing.T) { mockQueryClient.AssertExpectations(t) } +func TestGatherer_GatherLogs_WithKubernetesEventsAndSystemdLogs(t *testing.T) { + mockQueryClient := &MockQueryClient{} + gatherer := &Gatherer{ + QueryClient: mockQueryClient, + opts: GathererOptions{ + SkipKubernetesEventsLogs: false, + SkipSystemdLogs: false, + QueryOptions: &QueryOptions{ + SubscriptionId: "test-sub", + ResourceGroupName: "test-rg", + }, + }, + outputFunc: mockOutputFunc, + outputOptions: RowOutputOptions{"outputPath": "/test"}, + } + + // 1x ExecutePreconfiguredQuery for cluster IDs + mockQueryClient.On("ExecutePreconfiguredQuery", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("*kusto.ConfigurableQuery"), mock.Anything).Return(&kusto.QueryResult{}, nil) + // 2x ConcurrentQueries for services + HCP, + // 2x ConcurrentQueries for kubernetes events + systemd logs (empty queries since no cluster names) + mockQueryClient.On("ConcurrentQueries", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("[]*kusto.ConfigurableQuery"), mock.Anything).Return(nil) + + err := gatherer.GatherLogs(t.Context()) + assert.NoError(t, err) + + mockQueryClient.AssertExpectations(t) +} + +func 
TestGatherer_GatherLogs_SkipOnlySystemdLogs(t *testing.T) { + mockQueryClient := &MockQueryClient{} + gatherer := &Gatherer{ + QueryClient: mockQueryClient, + opts: GathererOptions{ + SkipKubernetesEventsLogs: false, + SkipSystemdLogs: true, + QueryOptions: &QueryOptions{ + SubscriptionId: "test-sub", + ResourceGroupName: "test-rg", + }, + }, + outputFunc: mockOutputFunc, + outputOptions: RowOutputOptions{"outputPath": "/test"}, + } + + // Cluster ID + cluster name queries + mockQueryClient.On("ExecutePreconfiguredQuery", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("*kusto.ConfigurableQuery"), mock.Anything).Return(&kusto.QueryResult{}, nil) + // Services + HCP + kubernetes events (no systemd) + mockQueryClient.On("ConcurrentQueries", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("[]*kusto.ConfigurableQuery"), mock.Anything).Return(nil) + + err := gatherer.GatherLogs(t.Context()) + assert.NoError(t, err) + + // Verify ConcurrentQueries was called 3 times (services + HCP + kubernetes events, no systemd) + mockQueryClient.AssertNumberOfCalls(t, "ConcurrentQueries", 3) +} + +func TestGatherer_GatherInfraLogs(t *testing.T) { + mockQueryClient := &MockQueryClient{} + gatherer := &Gatherer{ + QueryClient: mockQueryClient, + opts: GathererOptions{ + GatherInfraLogs: true, + QueryOptions: &QueryOptions{ + InfraClusterName: "test-infra-cluster", + }, + }, + outputFunc: mockOutputFunc, + outputOptions: RowOutputOptions{"outputPath": "/test"}, + infraLogsOnly: true, + } + + // 3x ConcurrentQueries: kubernetes events + systemd logs + services + mockQueryClient.On("ConcurrentQueries", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("[]*kusto.ConfigurableQuery"), mock.Anything).Return(nil).Times(3) + + err := gatherer.GatherLogs(t.Context()) + assert.NoError(t, err) + + mockQueryClient.AssertExpectations(t) +} + +func TestGatherer_GatherInfraLogs_Error(t *testing.T) { + mockQueryClient := &MockQueryClient{} + gatherer := 
&Gatherer{ + QueryClient: mockQueryClient, + opts: GathererOptions{ + GatherInfraLogs: true, + QueryOptions: &QueryOptions{ + InfraClusterName: "test-infra-cluster", + }, + }, + outputFunc: mockOutputFunc, + outputOptions: RowOutputOptions{"outputPath": "/test"}, + infraLogsOnly: true, + } + + // First ConcurrentQueries (kubernetes events) fails + mockQueryClient.On("ConcurrentQueries", mock.AnythingOfType("*context.cancelCtx"), mock.AnythingOfType("[]*kusto.ConfigurableQuery"), mock.Anything).Return(errors.New("query failed")).Once() + + err := gatherer.GatherLogs(t.Context()) + assert.Error(t, err) + assert.Contains(t, err.Error(), "kubernetes events") + + mockQueryClient.AssertExpectations(t) +} + +func TestGatherer_GatherLogs_ContextCancellation(t *testing.T) { + mockQueryClient := &MockQueryClient{} + ctx, cancel := context.WithCancel(t.Context()) + + gatherer := &Gatherer{ + QueryClient: mockQueryClient, + opts: GathererOptions{ + SkipKubernetesEventsLogs: true, + SkipSystemdLogs: true, + QueryOptions: &QueryOptions{ + SubscriptionId: "test-sub", + ResourceGroupName: "test-rg", + }, + }, + outputFunc: mockOutputFunc, + outputOptions: RowOutputOptions{"outputPath": "/test"}, + } + + // Cancel context before the query can complete + mockQueryClient.On("ExecutePreconfiguredQuery", mock.Anything, mock.AnythingOfType("*kusto.ConfigurableQuery"), mock.Anything).Run(func(args mock.Arguments) { + cancel() + }).Return(nil, context.Canceled).Once() + + err := gatherer.GatherLogs(ctx) + assert.Error(t, err) +} + func TestCliOutputFunc(t *testing.T) { // Success case tempDir, err := os.MkdirTemp("", "test-gatherer-*") @@ -133,16 +267,14 @@ func TestCliOutputFunc(t *testing.T) { string(QueryTypeServices): "services", } - go func() { - logLineChan <- &NormalizedLogLine{ - Log: []byte("test log"), - Cluster: "cluster1", - Namespace: "default", - ContainerName: "container1", - Timestamp: time.Now(), - } - close(logLineChan) - }() + logLineChan <- &NormalizedLogLine{ + Log: 
[]byte("test log"), + Cluster: "cluster1", + Namespace: "default", + ContainerName: "container1", + Timestamp: time.Now(), + } + close(logLineChan) err = cliOutputFunc(t.Context(), logLineChan, QueryTypeServices, options) assert.NoError(t, err) @@ -161,17 +293,80 @@ func TestCliOutputFunc(t *testing.T) { string(QueryTypeServices): "services", } + logLineChan <- &NormalizedLogLine{ + Log: []byte("test log"), + Cluster: "cluster1", + Namespace: "default", + ContainerName: "container1", + Timestamp: time.Now(), + } + close(logLineChan) + + err = cliOutputFunc(t.Context(), logLineChan, QueryTypeServices, badOptions) + assert.Error(t, err) +} + +func TestCliOutputFunc_KubernetesEvents(t *testing.T) { + tempDir, err := os.MkdirTemp("", "test-gatherer-events-*") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + err = os.MkdirAll(filepath.Join(tempDir, "cluster"), 0755) + require.NoError(t, err) + + logLineChan := make(chan *NormalizedLogLine, 1) + options := RowOutputOptions{ + "outputPath": tempDir, + } + go func() { logLineChan <- &NormalizedLogLine{ - Log: []byte("test log"), - Cluster: "cluster1", - Namespace: "default", - ContainerName: "container1", - Timestamp: time.Now(), + Log: []byte("event log line"), + Cluster: "test-cluster", + Timestamp: time.Now(), } close(logLineChan) }() - err = cliOutputFunc(t.Context(), logLineChan, QueryTypeServices, badOptions) - assert.Error(t, err) + err = cliOutputFunc(t.Context(), logLineChan, QueryTypeKubernetesEvents, options) + assert.NoError(t, err) + + // Kubernetes events use cluster-querytype.log naming + expectedFile := filepath.Join(tempDir, "cluster", "test-cluster-kubernetes-events.log") + assert.FileExists(t, expectedFile) + content, err := os.ReadFile(expectedFile) + require.NoError(t, err) + assert.Contains(t, string(content), "event log line") +} + +func TestCliOutputFunc_SystemdLogs(t *testing.T) { + tempDir, err := os.MkdirTemp("", "test-gatherer-systemd-*") + require.NoError(t, err) + defer 
os.RemoveAll(tempDir) + + err = os.MkdirAll(filepath.Join(tempDir, "cluster"), 0755) + require.NoError(t, err) + + logLineChan := make(chan *NormalizedLogLine, 1) + options := RowOutputOptions{ + "outputPath": tempDir, + } + + go func() { + logLineChan <- &NormalizedLogLine{ + Log: []byte("systemd log line"), + Cluster: "test-cluster", + Timestamp: time.Now(), + } + close(logLineChan) + }() + + err = cliOutputFunc(t.Context(), logLineChan, QueryTypeSystemdLogs, options) + assert.NoError(t, err) + + expectedFile := filepath.Join(tempDir, "cluster", "test-cluster-systemd-logs.log") + assert.FileExists(t, expectedFile) + content, err := os.ReadFile(expectedFile) + require.NoError(t, err) + assert.Contains(t, string(content), "systemd log line") } diff --git a/tooling/hcpctl/pkg/mustgather/queries.go b/tooling/hcpctl/pkg/mustgather/queries.go index c5b6e6df38..f6cdec3830 100644 --- a/tooling/hcpctl/pkg/mustgather/queries.go +++ b/tooling/hcpctl/pkg/mustgather/queries.go @@ -33,11 +33,6 @@ const ( QueryTypeSystemdLogs QueryType = "systemd-logs" ) -type InfraClusterType string - -const InfraClusterTypeService InfraClusterType = "service" -const InfraClusterTypeManagement InfraClusterType = "management" - var servicesDatabase = "ServiceLogs" var hostedControlPlaneLogsDatabase = "HostedControlPlaneLogs" @@ -56,21 +51,23 @@ type ClusterIdRow struct { ClusterId string `kusto:"cid"` } +type ClusterNameRow struct { + ClusterName string `kusto:"cluster"` +} + type QueryOptions struct { ClusterIds []string SubscriptionId string ResourceGroupName string InfraClusterName string - InfraClusterType InfraClusterType TimestampMin time.Time TimestampMax time.Time Limit int } -func NewInfraQueryOptions(infraClusterType InfraClusterType, infraClusterName string, timestampMin, timestampMax time.Time, limit int) (*QueryOptions, error) { +func NewInfraQueryOptions(infraClusterName string, timestampMin, timestampMax time.Time, limit int) (*QueryOptions, error) { return &QueryOptions{ 
InfraClusterName: infraClusterName, - InfraClusterType: infraClusterType, TimestampMin: timestampMin, TimestampMax: timestampMax, Limit: limit, @@ -186,3 +183,10 @@ func (opts *QueryOptions) GetHostedControlPlaneLogsQuery() []*kusto.Configurable func (opts *QueryOptions) GetClusterIdQuery() *kusto.ConfigurableQuery { return kusto.NewClusterIdQuery(servicesDatabase, clustersServiceLogsTable, opts.SubscriptionId, opts.ResourceGroupName) } + +func (opts *QueryOptions) GetClusterNamesQueries() []*kusto.ConfigurableQuery { + return []*kusto.ConfigurableQuery{ + kusto.NewClusterNamesQuery(servicesDatabase, containerLogsTable, opts.SubscriptionId, opts.ResourceGroupName), + kusto.NewClusterNamesQuery(hostedControlPlaneLogsDatabase, containerLogsTable, opts.SubscriptionId, opts.ResourceGroupName), + } +}