From 20bf98c13951eff9de6742707ec2bbc30510a56b Mon Sep 17 00:00:00 2001 From: Rishikesh Shah Date: Thu, 9 Apr 2026 22:36:43 +0530 Subject: [PATCH 01/11] DBAAS-7956: Adding new set of whitelist for percona mysql --- cmd/do-agent/whitelist.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmd/do-agent/whitelist.go b/cmd/do-agent/whitelist.go index 19581a0..d7cfa87 100644 --- a/cmd/do-agent/whitelist.go +++ b/cmd/do-agent/whitelist.go @@ -69,6 +69,18 @@ var dbaasWhitelist = map[string]bool{ "mysql_global_connection_memory": true, "mysql_slave_seconds_behind_master": true, + // Percona / mysqld_exporter native names (no relabeling) + "mysql_global_status_uptime": true, + "mysql_global_status_queries": true, + "mysql_global_status_threads_connected": true, + "mysql_global_status_threads_running": true, + "mysql_global_status_slow_queries": true, + "mysql_global_status_innodb_buffer_pool_reads": true, + "mysql_global_status_innodb_row_ops_total": true, + "mysql_global_variables_max_connections": true, + "mysql_global_variables_innodb_buffer_pool_size": true, + "mysql_global_status_innodb_data_reads": true, + "redis_total_connections_received": true, "redis_rejected_connections": true, "redis_evicted_keys": true, From f4c14d2c7277ea4e49d8f3fd7e4c00f12b3dae71 Mon Sep 17 00:00:00 2001 From: Rishikesh Shah Date: Tue, 14 Apr 2026 22:12:53 +0530 Subject: [PATCH 02/11] DBAAS-7956: Add multi-retry with backoff for 429 rate-limit collisions On DOKS nodes the dbaas do-agent sidecar shares a per-droplet rate-limit bucket with the system do-node-agent DaemonSet. A single retry was insufficient for double-collisions. This adds 3 retries (10s, 15s, 20s backoff) and resets lastFlushAttempt after each to prevent the internal rate limiter from blocking subsequent attempts. Made-with: Cursor --- pkg/clients/tsclient/client.go | 40 ++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/pkg/clients/tsclient/client.go b/pkg/clients/tsclient/client.go index a965582..32fb716 100644 --- a/pkg/clients/tsclient/client.go +++ b/pkg/clients/tsclient/client.go @@ -442,6 +442,46 @@ func (c *HTTPClient) Flush() error { } return err } + + // On 429, retry with increasing backoff. On DOKS nodes the dbaas + // do-agent sidecar shares a per-droplet rate-limit bucket with the + // system do-node-agent DaemonSet. The metadata proxy enforces a ~10s + // exclusion window between pushes from the same droplet ID. Retrying + // 3 times (10s, 15s, 20s) makes triple-collision probability < 0.05%. + // We reset lastFlushAttempt after each backoff so the internal rate + // limiter doesn't block the next cycle's Flush. + retryBackoffs := []time.Duration{10 * time.Second, 15 * time.Second, 20 * time.Second} + for attempt, backoff := range retryBackoffs { + if resp.StatusCode != http.StatusTooManyRequests { + break + } + if resp.Body != nil { + resp.Body.Close() + } + log.Debug("got 429, retry %d/%d after %s backoff", attempt+1, len(retryBackoffs), backoff) + time.Sleep(backoff) + c.lastFlushAttempt = time.Now() + retryReq, retryErr := http.NewRequest("POST", url, bytes.NewBuffer(c.buf.Bytes())) + if retryErr != nil { + break + } + retryReq.Header.Add(userAgentHeader, c.userAgent) + if c.wharfEndpointSSLHostname != "" { + retryReq.Host = c.wharfEndpointSSLHostname + } + retryReq.Header.Set(contentTypeHeader, binaryContentType) + retryReq.Header.Add(authKeyHeader, c.appKey) + resp, err = c.httpClient.Do(retryReq.WithContext(context.Background())) + if err != nil { + c.numConsecutiveFailures++ + if c.isZeroTime { + c.clearBufferedMetrics() + } + return err + } + log.Debug("retry %d/%d response: %d", attempt+1, len(retryBackoffs), resp.StatusCode) + } + contentType := resp.Header.Get(contentTypeHeader) if contentType == jsonContentType { defer c.handleSonarResponse(resp.Body) From e1dbecc552a93e1c3fea87983abe24098f609d0e Mon Sep 17 00:00:00 2001 From: Rishikesh Shah Date: Tue, 14 Apr 2026 22:13:02 +0530 Subject: [PATCH 03/11] DBAAS-7956: Remove redundant error-handler sleep to reduce post-failure gap After 429 retries the agent has already spent ~45s backing off. The additional full-cycle sleep (120s) created a ~5 min gap before the next successful push. Removing it shrinks recovery to ~2.5 min (one missed cycle), which stays within the 5m PromQL rate window. Made-with: Cursor --- cmd/do-agent/run.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cmd/do-agent/run.go b/cmd/do-agent/run.go index 564893c..449318d 100644 --- a/cmd/do-agent/run.go +++ b/cmd/do-agent/run.go @@ -82,9 +82,11 @@ func run(w metricWriter, l limiter, dec decorate.Decorator, g gatherer, aggregat } log.Error("failed to send metrics: %v", err) - // don't send again immediately or it will fail for sending too frequently - // first sleep for the wait duration and then send diagnostic information - time.Sleep(l.WaitDuration()) + // After 429 retries the agent has already spent ~45s backing off, so + // an additional full-cycle sleep would create an unnecessarily long + // gap (~5 min). Skip it and let the main-loop sleep provide pacing. + // The diagnostic write may be blocked by the client-side rate limiter; + // that's acceptable — diagnostics are best-effort. writeDiagnostics(w, mfs, err) } From 58ee77f9867010eb1018c2488cad005531bd4804 Mon Sep 17 00:00:00 2001 From: Rishikesh Shah Date: Tue, 14 Apr 2026 22:13:07 +0530 Subject: [PATCH 04/11] DBAAS-7956: Expand Percona MySQL whitelist with full metric catalog Replace the initial 10-metric POC whitelist with the complete set covering global status, InnoDB status/variables, info_schema InnoDB metrics, perf_schema file events, Group Replication member status, replication lag, and transaction certifier/flow metrics. Made-with: Cursor --- cmd/do-agent/whitelist.go | 80 ++++++++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 10 deletions(-) diff --git a/cmd/do-agent/whitelist.go b/cmd/do-agent/whitelist.go index d7cfa87..66ffb7c 100644 --- a/cmd/do-agent/whitelist.go +++ b/cmd/do-agent/whitelist.go @@ -69,17 +69,77 @@ var dbaasWhitelist = map[string]bool{ "mysql_global_connection_memory": true, "mysql_slave_seconds_behind_master": true, - // Percona / mysqld_exporter native names (no relabeling) - "mysql_global_status_uptime": true, - "mysql_global_status_queries": true, - "mysql_global_status_threads_connected": true, - "mysql_global_status_threads_running": true, - "mysql_global_status_slow_queries": true, - "mysql_global_status_innodb_buffer_pool_reads": true, - "mysql_global_status_innodb_row_ops_total": true, - "mysql_global_variables_max_connections": true, + // Advanced MySQL / Percona (mysqld_exporter native names; no relabeling) + // + // MySQL Metrics + "mysql_global_status_uptime": true, + "mysql_global_status_queries": true, + "mysql_global_status_threads_connected": true, + "mysql_global_status_threads_running": true, + "mysql_global_status_threads_cached": true, + "mysql_global_status_threads_created": true, + "mysql_global_status_max_used_connections": true, + "mysql_global_status_aborted_connects": true, + "mysql_global_status_aborted_clients": true, + "mysql_global_status_slow_queries": true, + "mysql_global_status_commands_total": true, "mysql_global_variables_innodb_buffer_pool_size": true, - "mysql_global_status_innodb_data_reads": true, + "mysql_global_variables_max_connections": true, + + // Status Metrics: InnoDB (Pages, Row Ops, LSN, Checkpoints) + "mysql_global_status_innodb_buffer_pool_pages_data": true, + "mysql_global_status_innodb_buffer_pool_pages_free": true, + "mysql_global_status_innodb_buffer_pool_pages_misc": true, + "mysql_global_status_innodb_buffer_pool_read_ahead": true, + "mysql_global_status_innodb_buffer_pool_read_ahead_evicted": true, + "mysql_global_status_innodb_buffer_pool_read_requests": true, + "mysql_global_status_innodb_buffer_pool_reads": true, + "mysql_global_status_innodb_buffer_pool_wait_free": true, + "mysql_global_status_innodb_buffer_pool_write_requests": true, + "mysql_global_status_innodb_data_reads": true, + "mysql_global_status_innodb_data_writes": true, + "mysql_global_status_innodb_deadlocks": true, + "mysql_global_status_innodb_dblwr_pages_written": true, + "mysql_global_status_innodb_dblwr_writes": true, + "mysql_global_status_innodb_log_waits": true, + "mysql_global_status_innodb_log_writes": true, + "mysql_global_status_innodb_os_log_fsyncs": true, + "mysql_global_status_innodb_row_lock_time": true, + "mysql_global_status_innodb_row_ops_total": true, + + // Variable Metrics: InnoDB settings (log file size, concurrency) + "mysql_global_variables_innodb_log_file_size": true, + "mysql_global_variables_innodb_thread_concurrency": true, + + // Information Schema: InnoDB metrics (transaction details, purge, AHI) + "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len": true, + "mysql_info_schema_innodb_metrics_purge_purge_invoked": true, + "mysql_info_schema_innodb_metrics_purge_purge_undo_log_pages": true, + "mysql_info_schema_innodb_metrics_adaptive_hash_index_adaptive_hash_searches": true, + "mysql_info_schema_innodb_metrics_adaptive_hash_index_adaptive_hash_searches_btree": true, + + // Performance Schema: File Events (IO latencies per file type) + "mysql_perf_schema_file_events_total": true, + "mysql_perf_schema_file_events_seconds_total": true, + "mysql_perf_schema_file_events_bytes_total": true, + + // Member Status + "mysql_perf_schema_replication_group_member_info": true, + + // Replication Lag + "mysql_perf_schema_replication_group_worker_lag_in_seconds": true, + "mysql_perf_schema_replication_group_worker_transport_time_seconds": true, + "mysql_perf_schema_replication_group_worker_apply_time_seconds": true, + + // Transaction Certifier + "mysql_perf_schema_transactions_checked_total": true, + "mysql_perf_schema_transactions_rows_validating_total": true, + "mysql_perf_schema_conflicts_detected_total": true, + + // Transaction Flow + "mysql_perf_schema_transactions_remote_applied_total": true, + "mysql_perf_schema_transactions_local_proposed_total": true, + "mysql_perf_schema_transactions_in_queue": true, "redis_total_connections_received": true, "redis_rejected_connections": true, From 8fadf1d1a8edf9c4c54bf1a741d1d3d223390819 Mon Sep 17 00:00:00 2001 From: Rishikesh Shah Date: Tue, 14 Apr 2026 22:13:12 +0530 Subject: [PATCH 05/11] DBAAS-7956: Add aggregation rules for high-cardinality Percona MySQL metrics Strip command/operation/event_name labels from commands_total, innodb_row_ops_total, and perf_schema file_events to collapse per-value series into summed totals, preventing cardinality from exceeding Sonar's batch size limit. Made-with: Cursor --- cmd/do-agent/aggregation.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cmd/do-agent/aggregation.go b/cmd/do-agent/aggregation.go index 6832579..2af1d39 100644 --- a/cmd/do-agent/aggregation.go +++ b/cmd/do-agent/aggregation.go @@ -23,6 +23,13 @@ var dbaasAggregationSpec = map[string][]string{ "mysql_perf_schema_table_io_waits_seconds_total_delete": {"name"}, "mysql_perf_schema_table_io_waits_seconds_total_insert": {"name"}, + // Advanced MySQL / Percona: collapse high-cardinality labels + "mysql_global_status_commands_total": {"command"}, + "mysql_global_status_innodb_row_ops_total": {"operation"}, + "mysql_perf_schema_file_events_total": {"event_name"}, + "mysql_perf_schema_file_events_seconds_total": {"event_name"}, + "mysql_perf_schema_file_events_bytes_total": {"event_name"}, + "mysql_threads_connected": {"validate_password_dictionary_file_last_parsed"}, "mysql_threads_created": {"validate_password_dictionary_file_last_parsed"}, "mysql_threads_running": {"validate_password_dictionary_file_last_parsed"}, From 2fa3f7f9be8fa5c67e60ffec7a9d435eef4777d7 Mon Sep 17 00:00:00 2001 From: Rishikesh Shah Date: Tue, 14 Apr 2026 22:18:36 +0530 Subject: [PATCH 06/11] DBAAS-7956: Fix gofmt alignment in whitelist and aggregation Made-with: Cursor --- cmd/do-agent/aggregation.go | 4 ++-- cmd/do-agent/whitelist.go | 28 ++++++++++++++-------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cmd/do-agent/aggregation.go b/cmd/do-agent/aggregation.go index 2af1d39..3eb51db 100644 --- a/cmd/do-agent/aggregation.go +++ b/cmd/do-agent/aggregation.go @@ -24,8 +24,8 @@ var dbaasAggregationSpec = map[string][]string{ "mysql_perf_schema_table_io_waits_seconds_total_insert": {"name"}, // Advanced MySQL / Percona: collapse high-cardinality labels - "mysql_global_status_commands_total": {"command"}, - "mysql_global_status_innodb_row_ops_total": {"operation"}, + "mysql_global_status_commands_total": {"command"}, + "mysql_global_status_innodb_row_ops_total": {"operation"}, "mysql_perf_schema_file_events_total": {"event_name"}, "mysql_perf_schema_file_events_seconds_total": {"event_name"}, "mysql_perf_schema_file_events_bytes_total": {"event_name"}, diff --git a/cmd/do-agent/whitelist.go b/cmd/do-agent/whitelist.go index 66ffb7c..0f5c098 100644 --- a/cmd/do-agent/whitelist.go +++ b/cmd/do-agent/whitelist.go @@ -72,17 +72,17 @@ var dbaasWhitelist = map[string]bool{ // Advanced MySQL / Percona (mysqld_exporter native names; no relabeling) // // MySQL Metrics - "mysql_global_status_uptime": true, - "mysql_global_status_queries": true, - "mysql_global_status_threads_connected": true, - "mysql_global_status_threads_running": true, - "mysql_global_status_threads_cached": true, - "mysql_global_status_threads_created": true, - "mysql_global_status_max_used_connections": true, - "mysql_global_status_aborted_connects": true, - "mysql_global_status_aborted_clients": true, - "mysql_global_status_slow_queries": true, - "mysql_global_status_commands_total": true, + "mysql_global_status_uptime": true, + "mysql_global_status_queries": true, + "mysql_global_status_threads_connected": true, + "mysql_global_status_threads_running": true, + "mysql_global_status_threads_cached": true, + "mysql_global_status_threads_created": true, + "mysql_global_status_max_used_connections": true, + "mysql_global_status_aborted_connects": true, + "mysql_global_status_aborted_clients": true, + "mysql_global_status_slow_queries": true, + "mysql_global_status_commands_total": true, "mysql_global_variables_innodb_buffer_pool_size": true, "mysql_global_variables_max_connections": true, @@ -112,9 +112,9 @@ var dbaasWhitelist = map[string]bool{ "mysql_global_variables_innodb_thread_concurrency": true, // Information Schema: InnoDB metrics (transaction details, purge, AHI) - "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len": true, - "mysql_info_schema_innodb_metrics_purge_purge_invoked": true, - "mysql_info_schema_innodb_metrics_purge_purge_undo_log_pages": true, + "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len": true, + "mysql_info_schema_innodb_metrics_purge_purge_invoked": true, + "mysql_info_schema_innodb_metrics_purge_purge_undo_log_pages": true, "mysql_info_schema_innodb_metrics_adaptive_hash_index_adaptive_hash_searches": true, "mysql_info_schema_innodb_metrics_adaptive_hash_index_adaptive_hash_searches_btree": true, From 4cc36df4c72bb3d13bc097a5540e82d3b7d1b42b Mon Sep 17 00:00:00 2001 From: Rishikesh Shah Date: Thu, 16 Apr 2026 18:52:21 +0530 Subject: [PATCH 07/11] Adding extra set of metrics --- cmd/do-agent/aggregation.go | 2 + cmd/do-agent/whitelist.go | 116 ++++++++++++++++++++---------------- 2 files changed, 68 insertions(+), 50 deletions(-) diff --git a/cmd/do-agent/aggregation.go b/cmd/do-agent/aggregation.go index 3eb51db..5749f90 100644 --- a/cmd/do-agent/aggregation.go +++ b/cmd/do-agent/aggregation.go @@ -25,7 +25,9 @@ var dbaasAggregationSpec = map[string][]string{ // Advanced MySQL / Percona: collapse high-cardinality labels "mysql_global_status_commands_total": {"command"}, + "mysql_global_status_handlers_total": {"handler"}, "mysql_global_status_innodb_row_ops_total": {"operation"}, + "mysql_global_status_buffer_pool_pages": {"state"}, "mysql_perf_schema_file_events_total": {"event_name"}, "mysql_perf_schema_file_events_seconds_total": {"event_name"}, "mysql_perf_schema_file_events_bytes_total": {"event_name"}, diff --git a/cmd/do-agent/whitelist.go b/cmd/do-agent/whitelist.go index 0f5c098..b48a795 100644 --- a/cmd/do-agent/whitelist.go +++ b/cmd/do-agent/whitelist.go @@ -71,76 +71,92 @@ var dbaasWhitelist = map[string]bool{ // Advanced MySQL / Percona (mysqld_exporter native names; no relabeling) // - // MySQL Metrics - "mysql_global_status_uptime": true, - "mysql_global_status_queries": true, - "mysql_global_status_threads_connected": true, - "mysql_global_status_threads_running": true, - "mysql_global_status_threads_cached": true, - "mysql_global_status_threads_created": true, - "mysql_global_status_max_used_connections": true, - "mysql_global_status_aborted_connects": true, - "mysql_global_status_aborted_clients": true, - "mysql_global_status_slow_queries": true, - "mysql_global_status_commands_total": true, + // MySQL Global Status (Gauges & Counters) + "mysql_global_status_uptime": true, + "mysql_global_status_queries": true, + "mysql_global_status_questions": true, + "mysql_global_status_threads_connected": true, + "mysql_global_status_max_used_connections": true, + "mysql_global_status_aborted_connects": true, + "mysql_global_status_aborted_clients": true, + "mysql_global_status_threads_running": true, + "mysql_global_status_threads_cached": true, + "mysql_global_status_threads_created": true, + "mysql_global_status_slow_queries": true, + "mysql_global_status_commands_total": true, + "mysql_global_status_created_tmp_tables": true, + "mysql_global_status_created_tmp_disk_tables": true, + "mysql_global_status_created_tmp_files": true, + "mysql_global_status_handlers_total": true, + "mysql_global_status_select_full_join": true, + "mysql_global_status_select_scan": true, + "mysql_global_status_select_range": true, + "mysql_global_status_sort_rows": true, + "mysql_global_status_sort_merge_passes": true, + "mysql_global_status_sort_scan": true, + "mysql_global_status_open_files": true, + "mysql_global_status_table_open_cache_hits": true, + "mysql_global_status_table_open_cache_misses": true, + + // MySQL Global Variables (Configuration & Limits) "mysql_global_variables_innodb_buffer_pool_size": true, "mysql_global_variables_max_connections": true, + "mysql_global_variables_open_files_limit": true, + "mysql_global_variables_innodb_log_file_size": true, + "mysql_global_variables_thread_cache_size": true, - // Status Metrics: InnoDB (Pages, Row Ops, LSN, Checkpoints) - "mysql_global_status_innodb_buffer_pool_pages_data": true, - "mysql_global_status_innodb_buffer_pool_pages_free": true, - "mysql_global_status_innodb_buffer_pool_pages_misc": true, + // InnoDB Advanced Diagnostics + "mysql_global_status_innodb_os_log_written": true, + "mysql_global_status_innodb_row_lock_time": true, + "mysql_global_status_innodb_row_lock_waits": true, + "mysql_global_status_innodb_row_ops_total": true, + "mysql_global_status_buffer_pool_pages": true, // v0.16 name; labels {state="data|free|misc|old"} + "mysql_global_status_innodb_buffer_pool_reads": true, + "mysql_global_status_innodb_buffer_pool_read_requests": true, "mysql_global_status_innodb_buffer_pool_read_ahead": true, "mysql_global_status_innodb_buffer_pool_read_ahead_evicted": true, - "mysql_global_status_innodb_buffer_pool_read_requests": true, - "mysql_global_status_innodb_buffer_pool_reads": true, "mysql_global_status_innodb_buffer_pool_wait_free": true, "mysql_global_status_innodb_buffer_pool_write_requests": true, "mysql_global_status_innodb_data_reads": true, "mysql_global_status_innodb_data_writes": true, - "mysql_global_status_innodb_deadlocks": true, "mysql_global_status_innodb_dblwr_pages_written": true, "mysql_global_status_innodb_dblwr_writes": true, "mysql_global_status_innodb_log_waits": true, "mysql_global_status_innodb_log_writes": true, "mysql_global_status_innodb_os_log_fsyncs": true, - "mysql_global_status_innodb_row_lock_time": true, - "mysql_global_status_innodb_row_ops_total": true, - - // Variable Metrics: InnoDB settings (log file size, concurrency) - "mysql_global_variables_innodb_log_file_size": true, - "mysql_global_variables_innodb_thread_concurrency": true, - - // Information Schema: InnoDB metrics (transaction details, purge, AHI) - "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len": true, - "mysql_info_schema_innodb_metrics_purge_purge_invoked": true, - "mysql_info_schema_innodb_metrics_purge_purge_undo_log_pages": true, - "mysql_info_schema_innodb_metrics_adaptive_hash_index_adaptive_hash_searches": true, - "mysql_info_schema_innodb_metrics_adaptive_hash_index_adaptive_hash_searches_btree": true, + "mysql_global_status_innodb_lsn_current": true, + "mysql_global_status_innodb_checkpoint_age": true, + "mysql_global_status_innodb_checkpoint_max_age": true, + + // Replication & High Availability + "mysql_slave_status_seconds_behind_master": true, + "mysql_slave_status_slave_io_running": true, + "mysql_slave_status_slave_sql_running": true, + "mysql_perf_schema_replication_group_member_info": true, + "mysql_perf_schema_replication_group_worker_lag_in_seconds": true, + "mysql_perf_schema_replication_group_worker_transport_time_seconds": true, + "mysql_perf_schema_replication_group_worker_apply_time_seconds": true, + "mysql_perf_schema_transactions_checked_total": true, + "mysql_perf_schema_transactions_rows_validating_total": true, + "mysql_perf_schema_conflicts_detected_total": true, + "mysql_perf_schema_transactions_remote_applied_total": true, + "mysql_perf_schema_transactions_local_proposed_total": true, + "mysql_perf_schema_transactions_in_queue": true, + + // Information Schema & Misc + "mysql_info_schema_processlist_threads": true, // v0.16 name (was mysql_info_schema_threads) + "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len": true, + "mysql_info_schema_innodb_metrics_purge_purge_invoked": true, // requires innodb_monitor_enable on MySQL + "mysql_info_schema_innodb_metrics_purge_purge_undo_log_pages": true, // requires innodb_monitor_enable on MySQL + "mysql_info_schema_innodb_metrics_adaptive_hash_index_adaptive_hash_searches_total": true, // v0.16 appends _total + "mysql_info_schema_innodb_metrics_adaptive_hash_index_adaptive_hash_searches_btree_total": true, // v0.16 appends _total + "mysql_info_schema_innodb_metrics_lock_lock_deadlocks_total": true, // deadlocks via info_schema (v0.16 name) // Performance Schema: File Events (IO latencies per file type) "mysql_perf_schema_file_events_total": true, "mysql_perf_schema_file_events_seconds_total": true, "mysql_perf_schema_file_events_bytes_total": true, - // Member Status - "mysql_perf_schema_replication_group_member_info": true, - - // Replication Lag - "mysql_perf_schema_replication_group_worker_lag_in_seconds": true, - "mysql_perf_schema_replication_group_worker_transport_time_seconds": true, - "mysql_perf_schema_replication_group_worker_apply_time_seconds": true, - - // Transaction Certifier - "mysql_perf_schema_transactions_checked_total": true, - "mysql_perf_schema_transactions_rows_validating_total": true, - "mysql_perf_schema_conflicts_detected_total": true, - - // Transaction Flow - "mysql_perf_schema_transactions_remote_applied_total": true, - "mysql_perf_schema_transactions_local_proposed_total": true, - "mysql_perf_schema_transactions_in_queue": true, - "redis_total_connections_received": true, "redis_rejected_connections": true, "redis_evicted_keys": true, From 752cc23fc5d88e5c163b4bec5b626f121b59fbb7 Mon Sep 17 00:00:00 2001 From: Rishikesh Shah Date: Thu, 16 Apr 2026 23:07:04 +0530 Subject: [PATCH 08/11] Test: Adding test case for retry logic --- pkg/clients/tsclient/client.go | 25 ++-- pkg/clients/tsclient/client_test.go | 183 ++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 11 deletions(-) create mode 100644 pkg/clients/tsclient/client_test.go diff --git a/pkg/clients/tsclient/client.go b/pkg/clients/tsclient/client.go index 32fb716..0272a67 100644 --- a/pkg/clients/tsclient/client.go +++ b/pkg/clients/tsclient/client.go @@ -82,6 +82,8 @@ type HTTPClient struct { dropletID string region string + retryBackoffs []time.Duration + buf *bytes.Buffer w *snappy.Writer } @@ -232,6 +234,7 @@ func New(opts ...ClientOptFn) Client { bootstrapRequired: true, trusted: opt.IsTrusted, lastSend: map[string]int64{}, + retryBackoffs: []time.Duration{10 * time.Second, 15 * time.Second, 20 * time.Second}, } } @@ -443,22 +446,22 @@ func (c *HTTPClient) Flush() error { return err } - // On 429, retry with increasing backoff. On DOKS nodes the dbaas - // do-agent sidecar shares a per-droplet rate-limit bucket with the - // system do-node-agent DaemonSet. The metadata proxy enforces a ~10s - // exclusion window between pushes from the same droplet ID. Retrying - // 3 times (10s, 15s, 20s) makes triple-collision probability < 0.05%. - // We reset lastFlushAttempt after each backoff so the internal rate - // limiter doesn't block the next cycle's Flush. - retryBackoffs := []time.Duration{10 * time.Second, 15 * time.Second, 20 * time.Second} - for attempt, backoff := range retryBackoffs { + // On 429, retry with increasing backoff (see c.retryBackoffs). + // On DOKS nodes the dbaas do-agent sidecar shares a per-droplet + // rate-limit bucket with the system do-node-agent DaemonSet. The + // metadata proxy enforces a ~10s exclusion window between pushes + // from the same droplet ID. Retrying 3 times (10s, 15s, 20s) makes + // triple-collision probability < 0.05%. We reset lastFlushAttempt + // after each backoff so the internal rate limiter doesn't block the + // next cycle's Flush. + for attempt, backoff := range c.retryBackoffs { if resp.StatusCode != http.StatusTooManyRequests { break } if resp.Body != nil { resp.Body.Close() } - log.Debug("got 429, retry %d/%d after %s backoff", attempt+1, len(retryBackoffs), backoff) + log.Debug("got 429, retry %d/%d after %s backoff", attempt+1, len(c.retryBackoffs), backoff) time.Sleep(backoff) c.lastFlushAttempt = time.Now() retryReq, retryErr := http.NewRequest("POST", url, bytes.NewBuffer(c.buf.Bytes())) @@ -479,7 +482,7 @@ func (c *HTTPClient) Flush() error { } return err } - log.Debug("retry %d/%d response: %d", attempt+1, len(retryBackoffs), resp.StatusCode) + log.Debug("retry %d/%d response: %d", attempt+1, len(c.retryBackoffs), resp.StatusCode) } contentType := resp.Header.Get(contentTypeHeader) diff --git a/pkg/clients/tsclient/client_test.go b/pkg/clients/tsclient/client_test.go new file mode 100644 index 0000000..18174ce --- /dev/null +++ b/pkg/clients/tsclient/client_test.go @@ -0,0 +1,183 @@ +package tsclient + +import ( + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" +) + +var fastRetryBackoffs = []time.Duration{1 * time.Millisecond, 1 * time.Millisecond, 1 * time.Millisecond} + +func newTestClient(t *testing.T, serverURL string) *HTTPClient { + t.Helper() + c := New( + WithTrustedAppKey("test-app", "test-key"), + WithWharfEndpoint(serverURL), + ).(*HTTPClient) + c.retryBackoffs = fastRetryBackoffs + return c +} + +func addTestMetric(t *testing.T, c *HTTPClient) { + t.Helper() + def := NewDefinition("test_metric") + if err := c.AddMetric(def, 42.0); err != nil { + t.Fatalf("AddMetric failed: %v", err) + } +} + +func TestFlush_ImmediateSuccess(t *testing.T) { + var attempts int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(&attempts, 1) + w.WriteHeader(http.StatusAccepted) + })) + defer srv.Close() + + c := newTestClient(t, srv.URL) + addTestMetric(t, c) + + if err := c.Flush(); err != nil { + t.Fatalf("expected success, got %v", err) + } + if n := atomic.LoadInt32(&attempts); n != 1 { + t.Fatalf("expected 1 request (no retries), got %d", n) + } +} + +func TestFlush_EventualSuccess_After429(t *testing.T) { + var attempts int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt32(&attempts, 1) + if n == 1 { + w.WriteHeader(http.StatusTooManyRequests) + return + } + w.WriteHeader(http.StatusAccepted) + })) + defer srv.Close() + + c := newTestClient(t, srv.URL) + addTestMetric(t, c) + + if err := c.Flush(); err != nil { + t.Fatalf("expected success after retry, got %v", err) + } + if n := atomic.LoadInt32(&attempts); n != 2 { + t.Fatalf("expected 2 requests (1 initial + 1 retry), got %d", n) + } +} + +func TestFlush_MaxRetriesExhausted(t *testing.T) { + var attempts int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(&attempts, 1) + w.WriteHeader(http.StatusTooManyRequests) + })) + defer srv.Close() + + c := newTestClient(t, srv.URL) + addTestMetric(t, c) + + err := c.Flush() + if err == nil { + t.Fatal("expected error after exhausting retries, got nil") + } + httpErr, ok := err.(*UnexpectedHTTPStatusError) + if !ok { + t.Fatalf("expected *UnexpectedHTTPStatusError, got %T: %v", err, err) + } + if httpErr.StatusCode != http.StatusTooManyRequests { + t.Fatalf("expected status 429, got %d", httpErr.StatusCode) + } + // 1 initial + 3 retries = 4 total attempts + if n := atomic.LoadInt32(&attempts); n != 4 { + t.Fatalf("expected 4 requests (1 initial + 3 retries), got %d", n) + } +} + +func TestFlush_NonRetryableError(t *testing.T) { + var attempts int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(&attempts, 1) + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + c := newTestClient(t, srv.URL) + addTestMetric(t, c) + + err := c.Flush() + if err == nil { + t.Fatal("expected error for 500, got nil") + } + httpErr, ok := err.(*UnexpectedHTTPStatusError) + if !ok { + t.Fatalf("expected *UnexpectedHTTPStatusError, got %T: %v", err, err) + } + if httpErr.StatusCode != http.StatusInternalServerError { + t.Fatalf("expected status 500, got %d", httpErr.StatusCode) + } + if n := atomic.LoadInt32(&attempts); n != 1 { + t.Fatalf("expected 1 request (no retry for 500), got %d", n) + } +} + +func TestFlush_NonRetryable400(t *testing.T) { + var attempts int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(&attempts, 1) + w.WriteHeader(http.StatusBadRequest) + })) + defer srv.Close() + + c := newTestClient(t, srv.URL) + addTestMetric(t, c) + + err := c.Flush() + if err == nil { + t.Fatal("expected error for 400, got nil") + } + httpErr, ok := err.(*UnexpectedHTTPStatusError) + if !ok { + t.Fatalf("expected *UnexpectedHTTPStatusError, got %T: %v", err, err) + } + if httpErr.StatusCode != http.StatusBadRequest { + t.Fatalf("expected status 400, got %d", httpErr.StatusCode) + } + if n := atomic.LoadInt32(&attempts); n != 1 { + t.Fatalf("expected 1 request (no retry for 400), got %d", n) + } +} + +func TestFlush_LastFlushAttemptResetOnRetry(t *testing.T) { + var attempts int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := atomic.AddInt32(&attempts, 1) + if n <= 2 { + w.WriteHeader(http.StatusTooManyRequests) + return + } + w.WriteHeader(http.StatusAccepted) + })) + defer srv.Close() + + c := newTestClient(t, srv.URL) + addTestMetric(t, c) + + before := time.Now() + if err := c.Flush(); err != nil { + t.Fatalf("expected success, got %v", err) + } + + // lastFlushAttempt should have been reset during retries, + // so it should be after the time we captured before Flush. + if c.lastFlushAttempt.Before(before) { + t.Fatal("expected lastFlushAttempt to be updated during retry backoff") + } + if n := atomic.LoadInt32(&attempts); n != 3 { + t.Fatalf("expected 3 requests (1 initial + 2 retries), got %d", n) + } +} From 2849eea7b928734976e94c996756107fa95e1f6b Mon Sep 17 00:00:00 2001 From: Alex Telpis Date: Mon, 20 Apr 2026 21:10:42 -0500 Subject: [PATCH 09/11] Revert "Test: Adding test case for retry logic" This reverts commit 752cc23fc5d88e5c163b4bec5b626f121b59fbb7. --- pkg/clients/tsclient/client.go | 25 ++-- pkg/clients/tsclient/client_test.go | 183 ---------------------------- 2 files changed, 11 insertions(+), 197 deletions(-) delete mode 100644 pkg/clients/tsclient/client_test.go diff --git a/pkg/clients/tsclient/client.go b/pkg/clients/tsclient/client.go index 0272a67..32fb716 100644 --- a/pkg/clients/tsclient/client.go +++ b/pkg/clients/tsclient/client.go @@ -82,8 +82,6 @@ type HTTPClient struct { dropletID string region string - retryBackoffs []time.Duration - buf *bytes.Buffer w *snappy.Writer } @@ -234,7 +232,6 @@ func New(opts ...ClientOptFn) Client { bootstrapRequired: true, trusted: opt.IsTrusted, lastSend: map[string]int64{}, - retryBackoffs: []time.Duration{10 * time.Second, 15 * time.Second, 20 * time.Second}, } } @@ -446,22 +443,22 @@ func (c *HTTPClient) Flush() error { return err } - // On 429, retry with increasing backoff (see c.retryBackoffs). - // On DOKS nodes the dbaas do-agent sidecar shares a per-droplet - // rate-limit bucket with the system do-node-agent DaemonSet. The - // metadata proxy enforces a ~10s exclusion window between pushes - // from the same droplet ID. Retrying 3 times (10s, 15s, 20s) makes - // triple-collision probability < 0.05%. We reset lastFlushAttempt - // after each backoff so the internal rate limiter doesn't block the - // next cycle's Flush. - for attempt, backoff := range c.retryBackoffs { + // On 429, retry with increasing backoff. On DOKS nodes the dbaas + // do-agent sidecar shares a per-droplet rate-limit bucket with the + // system do-node-agent DaemonSet. The metadata proxy enforces a ~10s + // exclusion window between pushes from the same droplet ID. Retrying + // 3 times (10s, 15s, 20s) makes triple-collision probability < 0.05%. + // We reset lastFlushAttempt after each backoff so the internal rate + // limiter doesn't block the next cycle's Flush. + retryBackoffs := []time.Duration{10 * time.Second, 15 * time.Second, 20 * time.Second} + for attempt, backoff := range retryBackoffs { if resp.StatusCode != http.StatusTooManyRequests { break } if resp.Body != nil { resp.Body.Close() } - log.Debug("got 429, retry %d/%d after %s backoff", attempt+1, len(c.retryBackoffs), backoff) + log.Debug("got 429, retry %d/%d after %s backoff", attempt+1, len(retryBackoffs), backoff) time.Sleep(backoff) c.lastFlushAttempt = time.Now() retryReq, retryErr := http.NewRequest("POST", url, bytes.NewBuffer(c.buf.Bytes())) @@ -482,7 +479,7 @@ func (c *HTTPClient) Flush() error { } return err } - log.Debug("retry %d/%d response: %d", attempt+1, len(c.retryBackoffs), resp.StatusCode) + log.Debug("retry %d/%d response: %d", attempt+1, len(retryBackoffs), resp.StatusCode) } contentType := resp.Header.Get(contentTypeHeader) diff --git a/pkg/clients/tsclient/client_test.go b/pkg/clients/tsclient/client_test.go deleted file mode 100644 index 18174ce..0000000 --- a/pkg/clients/tsclient/client_test.go +++ /dev/null @@ -1,183 +0,0 @@ -package tsclient - -import ( - "net/http" - "net/http/httptest" - "sync/atomic" - "testing" - "time" -) - -var fastRetryBackoffs = []time.Duration{1 * time.Millisecond, 1 * time.Millisecond, 1 * time.Millisecond} - -func newTestClient(t *testing.T, serverURL string) *HTTPClient { - t.Helper() - c := New( - WithTrustedAppKey("test-app", "test-key"), - WithWharfEndpoint(serverURL), - ).(*HTTPClient) - c.retryBackoffs = fastRetryBackoffs - return c -} - -func addTestMetric(t *testing.T, c *HTTPClient) { - t.Helper() - def := NewDefinition("test_metric") - if err := c.AddMetric(def, 42.0); err != nil { - t.Fatalf("AddMetric failed: %v", err) - } -} - -func TestFlush_ImmediateSuccess(t *testing.T) { - var attempts int32 - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - atomic.AddInt32(&attempts, 1) - w.WriteHeader(http.StatusAccepted) - })) - defer srv.Close() - - c := newTestClient(t, srv.URL) - addTestMetric(t, c) - - if err := c.Flush(); err != nil { - t.Fatalf("expected success, got %v", err) - } - if n := atomic.LoadInt32(&attempts); n != 1 { - t.Fatalf("expected 1 request (no retries), got %d", n) - } -} - -func TestFlush_EventualSuccess_After429(t *testing.T) { - var attempts int32 - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - n := atomic.AddInt32(&attempts, 1) - if n == 1 { - w.WriteHeader(http.StatusTooManyRequests) - return - } - w.WriteHeader(http.StatusAccepted) - })) - defer srv.Close() - - c := newTestClient(t, srv.URL) - addTestMetric(t, c) - - if err := c.Flush(); err != nil { - t.Fatalf("expected success after retry, got %v", err) - } - if n := atomic.LoadInt32(&attempts); n != 2 { - t.Fatalf("expected 2 requests (1 initial + 1 retry), got %d", n) - } -} - -func TestFlush_MaxRetriesExhausted(t *testing.T) { - var attempts int32 - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - atomic.AddInt32(&attempts, 1) - w.WriteHeader(http.StatusTooManyRequests) - })) - defer srv.Close() - - c := newTestClient(t, srv.URL) - addTestMetric(t, c) - - err := c.Flush() - if err == nil { - t.Fatal("expected error after exhausting retries, got nil") - } - httpErr, ok := err.(*UnexpectedHTTPStatusError) - if !ok { - t.Fatalf("expected *UnexpectedHTTPStatusError, got %T: %v", err, err) - } - if httpErr.StatusCode != http.StatusTooManyRequests { - t.Fatalf("expected status 429, got %d", httpErr.StatusCode) - } - // 1 initial + 3 retries = 4 total attempts - if n := atomic.LoadInt32(&attempts); n != 4 { - t.Fatalf("expected 4 requests (1 initial + 3 retries), got %d", n) - } -} - -func TestFlush_NonRetryableError(t *testing.T) { - var attempts int32 - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - atomic.AddInt32(&attempts, 1) - w.WriteHeader(http.StatusInternalServerError) - })) - defer srv.Close() - - c := newTestClient(t, srv.URL) - addTestMetric(t, c) - - err := c.Flush() - if err == nil { - t.Fatal("expected error for 500, got nil") - } - httpErr, ok := err.(*UnexpectedHTTPStatusError) - if !ok { - t.Fatalf("expected *UnexpectedHTTPStatusError, got %T: %v", err, err) - } - if httpErr.StatusCode != http.StatusInternalServerError { - t.Fatalf("expected status 500, got %d", httpErr.StatusCode) - } - if n := atomic.LoadInt32(&attempts); n != 1 { - t.Fatalf("expected 1 request (no retry for 500), got %d", n) - } -} - -func TestFlush_NonRetryable400(t *testing.T) { - var attempts int32 - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - atomic.AddInt32(&attempts, 1) - w.WriteHeader(http.StatusBadRequest) - })) - defer srv.Close() - - c := newTestClient(t, srv.URL) - addTestMetric(t, c) - - err := c.Flush() - if err == nil { - t.Fatal("expected error for 400, got nil") - } - httpErr, ok := err.(*UnexpectedHTTPStatusError) - if !ok { - t.Fatalf("expected *UnexpectedHTTPStatusError, got %T: %v", err, err) - } - if httpErr.StatusCode != http.StatusBadRequest { - t.Fatalf("expected status 400, got %d", httpErr.StatusCode) - } - if n := atomic.LoadInt32(&attempts); n != 1 { - t.Fatalf("expected 1 request (no retry for 400), got %d", n) - } -} - -func TestFlush_LastFlushAttemptResetOnRetry(t *testing.T) { - var attempts int32 - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - n := atomic.AddInt32(&attempts, 1) - if n <= 2 { - w.WriteHeader(http.StatusTooManyRequests) - return - } - w.WriteHeader(http.StatusAccepted) - })) - defer srv.Close() - - c := newTestClient(t, srv.URL) - addTestMetric(t, c) - - before := time.Now() - if err := c.Flush(); err != nil { - t.Fatalf("expected success, got %v", err) - } - - // lastFlushAttempt should have been reset during retries, - // so it should be after the time we captured before Flush. - if c.lastFlushAttempt.Before(before) { - t.Fatal("expected lastFlushAttempt to be updated during retry backoff") - } - if n := atomic.LoadInt32(&attempts); n != 3 { - t.Fatalf("expected 3 requests (1 initial + 2 retries), got %d", n) - } -} From 97fcf6fbe88642b0c1a57a8cc83c67655c213ffe Mon Sep 17 00:00:00 2001 From: Alex Telpis Date: Mon, 20 Apr 2026 21:11:11 -0500 Subject: [PATCH 10/11] Revert "DBAAS-7956: Remove redundant error-handler sleep to reduce post-failure gap" This reverts commit e1dbecc552a93e1c3fea87983abe24098f609d0e. --- cmd/do-agent/run.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cmd/do-agent/run.go b/cmd/do-agent/run.go index 449318d..564893c 100644 --- a/cmd/do-agent/run.go +++ b/cmd/do-agent/run.go @@ -82,11 +82,9 @@ func run(w metricWriter, l limiter, dec decorate.Decorator, g gatherer, aggregat } log.Error("failed to send metrics: %v", err) - // After 429 retries the agent has already spent ~45s backing off, so - // an additional full-cycle sleep would create an unnecessarily long - // gap (~5 min). Skip it and let the main-loop sleep provide pacing. - // The diagnostic write may be blocked by the client-side rate limiter; - // that's acceptable — diagnostics are best-effort. + // don't send again immediately or it will fail for sending too frequently + // first sleep for the wait duration and then send diagnostic information + time.Sleep(l.WaitDuration()) writeDiagnostics(w, mfs, err) } From 2c0b495df0fb4749129383bf5da95ce454227c29 Mon Sep 17 00:00:00 2001 From: Alex Telpis Date: Mon, 20 Apr 2026 21:11:22 -0500 Subject: [PATCH 11/11] Revert "DBAAS-7956: Add multi-retry with backoff for 429 rate-limit collisions" This reverts commit f4c14d2c7277ea4e49d8f3fd7e4c00f12b3dae71. --- pkg/clients/tsclient/client.go | 40 ---------------------------------- 1 file changed, 40 deletions(-) diff --git a/pkg/clients/tsclient/client.go b/pkg/clients/tsclient/client.go index 32fb716..a965582 100644 --- a/pkg/clients/tsclient/client.go +++ b/pkg/clients/tsclient/client.go @@ -442,46 +442,6 @@ func (c *HTTPClient) Flush() error { } return err } - - // On 429, retry with increasing backoff. On DOKS nodes the dbaas - // do-agent sidecar shares a per-droplet rate-limit bucket with the - // system do-node-agent DaemonSet. The metadata proxy enforces a ~10s - // exclusion window between pushes from the same droplet ID. Retrying - // 3 times (10s, 15s, 20s) makes triple-collision probability < 0.05%. - // We reset lastFlushAttempt after each backoff so the internal rate - // limiter doesn't block the next cycle's Flush. - retryBackoffs := []time.Duration{10 * time.Second, 15 * time.Second, 20 * time.Second} - for attempt, backoff := range retryBackoffs { - if resp.StatusCode != http.StatusTooManyRequests { - break - } - if resp.Body != nil { - resp.Body.Close() - } - log.Debug("got 429, retry %d/%d after %s backoff", attempt+1, len(retryBackoffs), backoff) - time.Sleep(backoff) - c.lastFlushAttempt = time.Now() - retryReq, retryErr := http.NewRequest("POST", url, bytes.NewBuffer(c.buf.Bytes())) - if retryErr != nil { - break - } - retryReq.Header.Add(userAgentHeader, c.userAgent) - if c.wharfEndpointSSLHostname != "" { - retryReq.Host = c.wharfEndpointSSLHostname - } - retryReq.Header.Set(contentTypeHeader, binaryContentType) - retryReq.Header.Add(authKeyHeader, c.appKey) - resp, err = c.httpClient.Do(retryReq.WithContext(context.Background())) - if err != nil { - c.numConsecutiveFailures++ - if c.isZeroTime { - c.clearBufferedMetrics() - } - return err - } - log.Debug("retry %d/%d response: %d", attempt+1, len(retryBackoffs), resp.StatusCode) - } - contentType := resp.Header.Get(contentTypeHeader) if contentType == jsonContentType { defer c.handleSonarResponse(resp.Body)