Skip to content

Commit 339ca8f

Browse files
committed
TNTP-2109: Add rebalance related metrics
1 parent 5f3f46f commit 339ca8f

File tree

3 files changed

+143
-6
lines changed

3 files changed

+143
-6
lines changed

crud.lua

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,8 @@ crud.rebalance.router_cache_length = rebalance.router.cache_length
174174
crud.rebalance.router_cache_last_clear_ts = rebalance.router.cache_last_clear_ts
175175

176176
-- Router-side initialization: stores the `crud` module table in the global
-- `crud` via rawset, then calls rebalance.metrics.enable_router_metrics().
-- NOTE(review): the bare `177-` / `178+` style lines inside the body look like
-- diff-gutter residue from the page this text was captured from, not Lua.
function crud.init_router()
177-
rawset(_G, 'crud', crud)
177+
rawset(_G, 'crud', crud)
178+
rebalance.metrics.enable_router_metrics()
178179
end
179180

180181
function crud.stop_router()

crud/common/rebalance.lua

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@ local log = require('log')
33
local vshard_consts = require('vshard.consts')
44
local utils = require('crud.common.utils')
55

6+
local has_metrics_module, metrics = pcall(require, 'metrics')
7+
68
local SETTINGS_SPACE_NAME = '_crud_settings'
79
local SAFE_MOD_ENABLE_EVENT = '_crud.safe_mode_enable'
810

9-
1011
local M = {
1112
safe_mode = false,
1213
safe_mode_enable_hooks = {},
@@ -85,6 +86,8 @@ local function safe_mode_disable()
8586
end
8687

8788
local function rebalance_init()
89+
M.metrics.enable_storage_metrics()
90+
8891
-- box.watch was introduced in tarantool 2.10.0
8992
if not utils.tarantool_supports_box_watch() then
9093
log.warn('This version of tarantool does not support autoswitch to safe mode during rebalance. '
@@ -131,20 +134,54 @@ local function rebalance_stop()
131134
end
132135

133136
-- Records the current fiber.time() as the last cache clear timestamp and then
-- calls _route_map_clear() on the vshard router instance.
-- The timestamp is written before the router instance is looked up.
-- Returns whatever _route_map_clear() returns.
local function router_cache_clear()
    M._router_cache_last_clear_ts = fiber.time()

    local router = utils.get_vshard_router_instance()
    return router:_route_map_clear()
end
138140

139141
-- Returns the `known_bucket_count` field of the current vshard router
-- instance.
local function router_cache_length()
    local router = utils.get_vshard_router_instance()
    return router.known_bucket_count
end
143144

144145
-- Returns M._router_cache_last_clear_ts, the value router_cache_clear()
-- stores from fiber.time() each time it is called.
local function router_cache_last_clear_ts()
145146
return M._router_cache_last_clear_ts
146147
end
147148

149+
-- Rebalance related metrics
150+
-- Gauge handle shared between enable_storage_metrics() and the collect
-- callback below; it is refreshed on every enable_storage_metrics() call.
local safe_mode_enabled_gauge

-- Collect callback: publishes the current safe mode state as 1 (enabled)
-- or 0 (disabled).
-- It is a single module-level function on purpose: enable_storage_metrics()
-- runs on every rebalance init, and registering a fresh closure each time
-- would hand the metrics registry a new callback on every call.
local function update_storage_metrics()
    safe_mode_enabled_gauge:set(safe_mode_status() and 1 or 0)
end

-- Registers the storage safe mode gauge and its collect callback.
-- Does nothing when the `metrics` module could not be loaded.
-- May be called repeatedly: the gauge handle is re-acquired each time and the
-- same callback function value is passed to metrics.register_callback().
local function enable_storage_metrics()
    if not has_metrics_module then
        return
    end

    safe_mode_enabled_gauge = metrics.gauge(
        'tnt_crud_storage_safe_mode_enabled',
        "is safe mode enabled on this storage instance"
    )

    metrics.register_callback(update_storage_metrics)
end
164+
165+
-- Gauge handles shared between enable_router_metrics() and the collect
-- callback below; they are refreshed on every enable_router_metrics() call.
local router_cache_length_gauge
local router_cache_last_clear_ts_gauge

-- Collect callback: publishes the router cache length and the timestamp of
-- the last router cache clear.
-- It is a single module-level function on purpose: enable_router_metrics()
-- runs on every router init, and registering a fresh closure each time would
-- hand the metrics registry a new callback on every call.
local function update_router_metrics()
    router_cache_length_gauge:set(router_cache_length())
    router_cache_last_clear_ts_gauge:set(router_cache_last_clear_ts())
end

-- Registers the router cache gauges and their collect callback.
-- Does nothing when the `metrics` module could not be loaded.
-- May be called repeatedly: the gauge handles are re-acquired each time and
-- the same callback function value is passed to metrics.register_callback().
local function enable_router_metrics()
    if not has_metrics_module then
        return
    end

    router_cache_length_gauge = metrics.gauge(
        'tnt_crud_router_cache_length',
        "number of bucket routes in vshard router cache"
    )
    router_cache_last_clear_ts_gauge = metrics.gauge(
        'tnt_crud_router_cache_last_clear_ts',
        "when vshard router cache was cleared last time"
    )

    metrics.register_callback(update_router_metrics)
end
184+
148185
M.init = rebalance_init
149186
M.stop = rebalance_stop
150187
M.safe_mode_status = safe_mode_status
@@ -167,4 +204,9 @@ M.storage_api = {
167204
rebalance_safe_mode_disable = safe_mode_disable,
168205
}
169206

207+
-- Metrics entry points exported by this module. Both functions return
-- immediately when the `metrics` module could not be loaded.
M.metrics = {
208+
enable_storage_metrics = enable_storage_metrics,
209+
enable_router_metrics = enable_router_metrics,
210+
}
211+
170212
return M

test/integration/metrics_test.lua

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
local helpers = require('test.helper')
2+
local t = require('luatest')
3+
4+
-- luatest group for the metrics integration tests. Its parameters come from
-- helpers.backend_matrix(), called with a single memtx engine entry.
local pgroup = t.group('metrics_integration', helpers.backend_matrix({
5+
{engine = 'memtx'},
6+
}))
7+
8+
-- Group setup: start the default cluster with the 'srv_stats' entrypoint.
pgroup.before_all(function(g)
    helpers.start_default_cluster(g, 'srv_stats')
end)

-- Group teardown: stop the cluster for the backend under test.
pgroup.after_all(function(g)
    helpers.stop_cluster(g.cluster, g.params.backend)
end)

-- Per-test setup: bind the global `crud` on the router and call the safe mode
-- disable endpoint on every storage.
pgroup.before_each(function(g)
    g.router:eval("crud = require('crud')")

    helpers.call_on_storages(g.cluster, function(storage)
        storage:call('_crud.rebalance_safe_mode_disable')
    end)
end)
28+
29+
-- Lua snippet evaluated on a server to collect metrics with callbacks invoked.
local COLLECT_METRICS_EXPR = "return require('metrics').collect({ invoke_callbacks = true })"

-- Returns the first observation in `observed` whose metric_name equals
-- `metric_name`, or nil when there is none.
local function find_observation(observed, metric_name)
    for _, observation in pairs(observed) do
        if observation.metric_name == metric_name then
            return observation
        end
    end
    return nil
end

-- Checks the rebalance related metrics:
--   * tnt_crud_storage_safe_mode_enabled on every storage follows the safe
--     mode state (0 while disabled, 1 after enabling);
--   * tnt_crud_router_cache_last_clear_ts on the router is positive and grows
--     after crud.rebalance.router_cache_clear() is called.
-- The test is skipped when the `metrics` module is not available.
pgroup.test_safe_mode_metrics = function(g)
    -- require() raises when the module is missing, so it has to go through
    -- pcall for the skip below to be reachable.
    local has_metrics_module = pcall(require, 'metrics')
    t.skip_if(not has_metrics_module, 'No metrics module in current version')

    -- Asserts that every storage reports the safe mode gauge with `expected`.
    local function assert_safe_mode_metric(expected, message)
        helpers.call_on_storages(g.cluster, function(server)
            local observed = server:eval(COLLECT_METRICS_EXPR)
            local observation = find_observation(observed, 'tnt_crud_storage_safe_mode_enabled')
            if observation == nil then
                t.fail('No tnt_crud_storage_safe_mode_enabled metric found')
            end
            t.assert_equals(observation.value, expected, message)
        end)
    end

    -- Reads the last cache clear timestamp from the router metrics; falls back
    -- to 0 when the metric is absent so the comparisons below fail.
    local function router_last_clear_ts()
        local observed = g.router:eval(COLLECT_METRICS_EXPR)
        local observation = find_observation(observed, 'tnt_crud_router_cache_last_clear_ts')
        if observation == nil then
            return 0
        end
        return observation.value
    end

    -- Check safe mode metric on storage
    assert_safe_mode_metric(0, 'Metric shows safe mode disabled')

    -- Enable safe mode
    helpers.call_on_storages(g.cluster, function(server)
        server:call('_crud.rebalance_safe_mode_enable')
    end)

    -- Check that metric value has changed
    assert_safe_mode_metric(1, 'Metric shows safe mode enabled')

    -- Check router cache metric
    local first_ts = router_last_clear_ts()
    t.assert_gt(first_ts, 0, 'Last cache clear TS is greater than zero')

    -- Clear router cache
    g.router:eval("crud.rebalance.router_cache_clear()")

    -- Check that last_clear_ts has changed
    local new_ts = router_last_clear_ts()
    t.assert_gt(new_ts, first_ts, 'Last cache clear TS is greater than the first one')
end

0 commit comments

Comments
 (0)