Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
new(this): {
groups+: [
{
name: 'influxdb',
rules: [
{
alert: 'InfluxDBWarningTaskSchedulerHighFailureRate',
alert: 'InfluxDBWarningTaskHighFailureRate',
expr: |||
100 * rate(task_scheduler_total_execute_failure[5m])/clamp_min(rate(task_scheduler_total_execution_calls[5m]), 1) >= %(alertsWarningTaskSchedulerHighFailureRate)s
||| % $._config,
100 * rate(task_scheduler_total_execute_failure{%(filteringSelector)s}[5m])/clamp_min(rate(task_scheduler_total_execution_calls{%(filteringSelector)s}[5m]), 1) >= %(alertsWarningTaskSchedulerHighFailureRate)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -19,14 +19,14 @@
(
'Task scheduler task executions for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} are failing at a rate of {{ printf "%%.0f" $value }} percent, ' +
'which is above the threshold of %(alertsWarningTaskSchedulerHighFailureRate)s percent.'
) % $._config,
) % this.config,
},
},
{
alert: 'InfluxDBCriticalTaskSchedulerHighFailureRate',
alert: 'InfluxDBCriticalTaskHighFailureRate',
expr: |||
100 * rate(task_scheduler_total_execute_failure[5m])/clamp_min(rate(task_scheduler_total_execution_calls[5m]), 1) >= %(alertsCriticalTaskSchedulerHighFailureRate)s
||| % $._config,
100 * rate(task_scheduler_total_execute_failure{%(filteringSelector)s}[5m])/clamp_min(rate(task_scheduler_total_execution_calls{%(filteringSelector)s}[5m]), 1) >= %(alertsCriticalTaskSchedulerHighFailureRate)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -37,14 +37,14 @@
(
'Task scheduler task executions for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} are failing at a rate of {{ printf "%%.0f" $value }} percent, ' +
'which is above the threshold of %(alertsCriticalTaskSchedulerHighFailureRate)s percent.'
) % $._config,
) % this.config,
},
},
{
alert: 'InfluxDBHighBusyWorkerPercentage',
expr: |||
task_executor_workers_busy >= %(alertsWarningHighBusyWorkerPercentage)s
||| % $._config,
task_executor_workers_busy{%(filteringSelector)s} >= %(alertsWarningHighBusyWorkerPercentage)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -55,14 +55,14 @@
(
'The busy worker percentage for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} is {{ printf "%%.0f" $value }} percent, ' +
'which is above the threshold of %(alertsWarningHighBusyWorkerPercentage)s percent.'
) % $._config,
) % this.config,
},
},
{
alert: 'InfluxDBHighHeapMemoryUsage',
expr: |||
100 * go_memstats_heap_alloc_bytes/clamp_min((go_memstats_heap_idle_bytes + go_memstats_heap_alloc_bytes), 1) >= %(alertsWarningHighHeapMemoryUsage)s
||| % $._config,
100 * go_memstats_heap_alloc_bytes{%(filteringSelector)s}/clamp_min((go_memstats_heap_idle_bytes{%(filteringSelector)s} + go_memstats_heap_alloc_bytes{%(filteringSelector)s}), 1) >= %(alertsWarningHighHeapMemoryUsage)s
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -73,14 +73,14 @@
(
'The heap memory usage for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} is {{ printf "%%.0f" $value }} percent, ' +
'which is above the threshold of %(alertsWarningHighHeapMemoryUsage)s percent.'
) % $._config,
) % this.config,
},
},
{
alert: 'InfluxDBHighAverageAPIRequestLatency',
expr: |||
sum without(handler, method, path, response_code, status, user_agent) (increase(http_api_request_duration_seconds_sum[5m])/clamp_min(increase(http_api_requests_total[5m]), 1)) >= %(alertsWarningHighAverageAPIRequestLatency)s
||| % $._config,
sum without(handler, method, path, response_code, status, user_agent) (increase(http_api_request_duration_seconds_sum{%(filteringSelector)s}[5m])/clamp_min(increase(http_api_requests_total{%(filteringSelector)s}[5m]), 1)) >= %(alertsWarningHighAverageAPIRequestLatency)s
||| % this.config,
'for': '1m',
labels: {
severity: 'critical',
Expand All @@ -90,14 +90,14 @@
description:
(
'The average API request latency for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} is {{ printf "%%.2f" $value }} seconds, which is above the threshold of %(alertsWarningHighAverageAPIRequestLatency)s seconds.'
) % $._config,
) % this.config,
},
},
{
alert: 'InfluxDBSlowAverageIQLExecutionTime',
expr: |||
sum without(result) (increase(influxql_service_executing_duration_seconds_sum[5m])/clamp_min(increase(influxql_service_requests_total[5m]), 1)) >= %(alertsWarningSlowAverageIQLExecutionTime)s
||| % $._config,
sum without(result) (increase(influxql_service_executing_duration_seconds_sum{%(filteringSelector)s}[5m])/clamp_min(increase(influxql_service_requests_total{%(filteringSelector)s}[5m]), 1)) >= %(alertsWarningSlowAverageIQLExecutionTime)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -108,7 +108,7 @@
(
'The average InfluxQL query execution time for instance {{$labels.instance}} on cluster {{$labels.influxdb_cluster}} is {{ printf "%%.2f" $value }} seconds, ' +
'which is above the threshold of %(alertsWarningSlowAverageIQLExecutionTime)s seconds.'
) % $._config,
) % this.config,
},
},
],
Expand Down
49 changes: 32 additions & 17 deletions influxdb-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,23 +1,38 @@
{
_config+:: {
enableMultiCluster: false,
influxdbSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
multiclusterSelector: 'job=~"$job"',
filterSelector: 'job=~"integrations/influxdb"',
local this = self,
filteringSelector: 'job="integrations/influxdb"',
groupLabels: ['job', 'influxdb_cluster'],
instanceLabels: ['instance'],
dashboardTags: ['influxdb-mixin'],
uid: 'influxdb',
dashboardNamePrefix: 'InfluxDB',

dashboardTags: ['influxdb-mixin'],
dashboardPeriod: 'now-30m',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
// additional params
dashboardPeriod: 'now-30m',
dashboardTimezone: 'default',
dashboardRefresh: '1m',

// alerts thresholds
alertsWarningTaskSchedulerHighFailureRate: 25, // %
alertsCriticalTaskSchedulerHighFailureRate: 50, // %
alertsWarningHighBusyWorkerPercentage: 80, // %
alertsWarningHighHeapMemoryUsage: 80, // %
alertsWarningHighAverageAPIRequestLatency: 0.3, // count
alertsWarningSlowAverageIQLExecutionTime: 0.1, // count
// logs lib related
enableLokiLogs: true,
logLabels: ['job', 'instance', 'influxdb_cluster', 'level'],
extraLogLabels: [], // Required by logs-lib
logsVolumeGroupBy: 'level',
showLogsVolume: true,

enableLokiLogs: true,
// alert thresholds (substituted into the alert expressions and descriptions via %(name)s)
alertsWarningTaskSchedulerHighFailureRate: 25, // % of task executions failing (5m rate)
alertsCriticalTaskSchedulerHighFailureRate: 50, // % of task executions failing (5m rate)
alertsWarningHighBusyWorkerPercentage: 80, // %
alertsWarningHighHeapMemoryUsage: 80, // % (heap alloc / (heap idle + heap alloc))
alertsWarningHighAverageAPIRequestLatency: 0.3, // seconds (duration_seconds_sum / requests_total over 5m), not a count
alertsWarningSlowAverageIQLExecutionTime: 0.1, // seconds (executing_duration_seconds_sum / requests_total over 5m), not a count

// metrics source for signals library
metricsSource: 'prometheus',

legendCustomTemplate: std.join(' ', std.map(function(label) '{{' + label + '}}', this.instanceLabels)),
signals+: {
overview: (import './signals/overview.libsonnet')(this),
instance: (import './signals/instance.libsonnet')(this),
},
}
127 changes: 127 additions & 0 deletions influxdb-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
local g = import '../g.libsonnet';
local logslib = import 'logs-lib/logs/main.libsonnet';

{
// Dashboard definitions for the InfluxDB mixin.
// `new(this)` returns an object keyed by dashboard file name: a cluster overview,
// an instance overview and, when `this.config.enableLokiLogs` is true, a logs dashboard.
// `applyCommon(...)` is the shared mixin that stamps tags, uid, links, time settings,
// variables and annotations onto each dashboard.

// Alias for this outer object so nested objects below can still reach `applyCommon`.
local root = self,
// new(this): hidden (`::`) constructor.
// `this` must expose the fields read below: `this.config.*` (dashboardNamePrefix, dashboardTags,
// uid, dashboardRefresh, dashboardPeriod, dashboardTimezone, enableLokiLogs, filteringSelector,
// groupLabels, extraLogLabels, showLogsVolume) and `this.grafana.*` (links, variables,
// annotations, rows).
new(this)::
local prefix = this.config.dashboardNamePrefix;
local links = this.grafana.links;
local tags = this.config.dashboardTags;
// Base uid is passed through the library's slugify helper; each dashboard appends its own suffix.
local uid = g.util.string.slugify(this.config.uid);
local vars = this.grafana.variables;
local annotations = this.grafana.annotations;
local refresh = this.config.dashboardRefresh;
local period = this.config.dashboardPeriod;
local timezone = this.config.dashboardTimezone;

{
// InfluxDB cluster overview dashboard
'influxdb-cluster-overview.json':
g.dashboard.new(prefix + ' cluster overview')
+ g.dashboard.withDescription('Dashboard providing an overview of InfluxDB cluster performance and health.')
+ g.dashboard.withPanels(
g.util.panel.resolveCollapsedFlagOnRows(
g.util.grid.wrapPanels(
[
this.grafana.rows.influxdbClusterOverview,
this.grafana.rows.influxdbClusterOverviewQueriesAndOperations,
this.grafana.rows.influxdbClusterOverviewTaskScheduler,
this.grafana.rows.influxdbClusterOverviewMemoryAndGC,
]
)
)
)
+ root.applyCommon(
// Variables: everything in vars.multiInstance except the one named 'instance',
// plus a single-select custom variable `k` ("Top node count", default '5').
std.filter(
function(x) x.name != 'instance',
vars.multiInstance,
) + [
g.dashboard.variable.custom.new(
'k',
values=['5', '10', '20', '50'],
) + g.dashboard.variable.custom.generalOptions.withCurrent('5')
+ g.dashboard.variable.custom.generalOptions.withLabel('Top node count')
+ g.dashboard.variable.custom.selectionOptions.withMulti(false)
+ g.dashboard.variable.custom.selectionOptions.withIncludeAll(false),
],
uid + '_cluster_overview',
tags,
// `+::` re-declares this link field as hidden, so std.objectValues(links) in applyCommon
// (visible fields only) leaves it out — presumably so the dashboard does not link to itself.
links { influxdbClusterOverview+:: {} },
annotations,
timezone,
refresh,
period
),

// InfluxDB instance overview dashboard
'influxdb-instance-overview.json':
g.dashboard.new(prefix + ' instance overview')
+ g.dashboard.withDescription('Dashboard providing detailed overview of InfluxDB instance performance, including configuration stats, Go runtime performance, query/request load, and task scheduler activity.')
+ g.dashboard.withPanels(
g.util.panel.resolveCollapsedFlagOnRows(
g.util.grid.wrapPanels(
[
this.grafana.rows.influxdbInstanceOverview,
this.grafana.rows.influxdbInstanceOverviewQueriesAndOperations,
this.grafana.rows.influxdbInstanceOverviewTaskScheduler,
this.grafana.rows.influxdbInstanceOverviewGo,
]
)
)
)
+ root.applyCommon(
// Unlike the cluster dashboard, the full multiInstance variable list is used unchanged.
vars.multiInstance,
uid + '_instance_overview',
tags,
// Same hidden-field override as above, applied to this dashboard's own link.
links { influxdbInstanceOverview+:: {} },
annotations,
timezone,
refresh,
period
),
}
+
// The logs dashboard is only emitted when Loki logs are enabled; otherwise nothing is added.
if this.config.enableLokiLogs then
{
'influxdb-logs.json':
logslib.new(
prefix + ' logs',
datasourceName=this.grafana.variables.datasources.loki.name,
datasourceRegex=this.grafana.variables.datasources.loki.regex,
filterSelector=this.config.filteringSelector,
labels=this.config.groupLabels + this.config.extraLogLabels,
formatParser=null,
showLogsVolume=this.config.showLogsVolume,
)
// Override parts of the object returned by logs-lib, then pick out its logs dashboard
// via `.dashboards.logs` at the end of this expression.
{
dashboards+:
{
logs+:
// Apply the same common settings as the other dashboards, reusing the variable list
// that logs-lib already generated (super.logs.templating.list).
root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links { logs+:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period),
},
panels+:
{
logs+:
g.panel.logs.options.withEnableLogDetails(true)
+ g.panel.logs.options.withShowTime(false)
+ g.panel.logs.options.withWrapLogMessage(false),
},
variables+: {
// Append the Prometheus datasource variable with `hide: 2`.
// NOTE(review): in Grafana's variable model hide=2 should hide the variable entirely — confirm.
toArray+: [
this.grafana.variables.datasources.prometheus { hide: 2 },
],
},
}.dashboards.logs,
}
else {},

// applyCommon: returns a dashboard mixin (to be `+`-ed onto a dashboard) that sets:
//   vars        - list of dashboard variables
//   uid         - dashboard uid string
//   tags        - dashboard tags
//   links       - object of links; only its visible field values are used
//   annotations - object of annotations; only its visible field values are used
//   timezone, refresh - passed straight to the corresponding dashboard setters
//   period      - used as the dashboard time range "from" value
applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
g.dashboard.withTags(tags)
+ g.dashboard.withUid(uid)
+ g.dashboard.withLinks(std.objectValues(links))
+ g.dashboard.withTimezone(timezone)
+ g.dashboard.withRefresh(refresh)
+ g.dashboard.time.withFrom(period)
+ g.dashboard.withVariables(vars)
+ g.dashboard.withAnnotations(std.objectValues(annotations)),
}
3 changes: 0 additions & 3 deletions influxdb-mixin/dashboards/dashboards.libsonnet

This file was deleted.

Loading
Loading