-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsm-client
More file actions
executable file
·327 lines (269 loc) · 16 KB
/
sm-client
File metadata and controls
executable file
·327 lines (269 loc) · 16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
#!/usr/bin/env python3
import argparse
import logging
import sys
import copy
from prettytable import PrettyTable # type: ignore
from sm_client.data import settings
from sm_client.data.structures import TopologicalSorter2, SMClusterState, NotValid
from sm_client.initialization import sm_get_cluster_state, init_and_check_config
from sm_client.prepare import make_ordered_services_to_process
from sm_client.processing import run_status_procedure, run_dr_or_site_procedure
from sm_client.validation import validate_operation
# Description shown at the top of `--help`: a table mapping the initial state of the
# active/standby sites to the final state produced by each DR command.
MAIN_HELP_SECTION = """
Script to manage DR cases in kubernetes Active-Standby scheme
How to use commands:
+------------------------------+ +------------------------------+
| INITIAL CONDITION | | FINAL CONDITION |
+--------------+---------------+--------+--------------+---------------+-----+---------+
| ACTIVE SITE | STANDBY SITE | ===> | ACTIVE SITE | STANDBY SITE | | COMMAND |
+--------------+---------------+ ===> +--------------+---------------+ +---------+
| ACTIVE | STANDBY | ===> | STANDBY | ACTIVE | = | move |
| failed | STANDBY | ===> | stopped | ACTIVE | = | stop |
| stopped | ACTIVE | ===> | STANDBY | ACTIVE | = | return |
| ACTIVE | stopped | ===> | ACTIVE | STANDBY | = | return |
| ACTIVE | STANDBY | ===> | ACTIVE | stopped | = | disable |
+--------------+---------------+--------+--------------+---------------+-----+---------+
"""
# Help text attached to the positional `site` argument of every sub-command that takes one.
SITE_HELP_SECTION = """define the cluster name.
This site:
will be active in case of move
will be standby in case of stop
will be standby in case of return
will be disable to maintenance in case of disable
"""
# Parsed command-line namespace; declared here and assigned in main().
args: argparse.Namespace
def print_service_order(sm_dict: SMClusterState, cmd, site):
    """Log (debug level) the dependency-ordered stages services will go through.

    Nothing is logged for "status" and "list". For every other command each
    flow from settings.module_flow is walked stage by stage using a deep copy
    of the flow's sorter, so the sorter stored in sm_dict is not consumed.
    """
    # status and list are collected in parallel for all services, there is no order
    if cmd in ("status", "list"):
        return

    stage_no = 0
    logging.debug("Service order by dependency:")
    for flow_entry in settings.module_flow:
        flow, flow_cmds = list(flow_entry.items())[0]

        # a flow restricted to commands that are irrelevant for the current cmd is skipped
        active_only_flow = cmd in ('standby', 'disable', 'return') and flow_cmds == ['active']
        passive_only_flow = cmd == 'active' and bool(flow_cmds) and set(flow_cmds) == {'standby', 'disable'}
        if active_only_flow or passive_only_flow:
            continue

        # walk a private copy of the flow's sorter; every batch it yields is one stage
        sorter = copy.deepcopy(sm_dict.globals[flow]['ts'])
        while sorter and sorter.is_active():
            stage_no += 1
            logging.debug(f"------ Stage {stage_no} -------")
            for svc in sorter.get_ready():
                # line format: "$svc: $cmd on $site", optionally chained with " -> " for DR sequences
                if cmd not in settings.dr_processing_cmd:
                    # per-site command: applied as is to the requested site
                    logging.debug(f"{svc}: {cmd} on {site}")
                elif flow_cmds:
                    # flow pinned to one command: standby-during-move and active-during-stop
                    # are reported against the opposite site
                    flips_site = (flow_cmds[0], cmd) in (("standby", "move"), ("active", "stop"))
                    actual_site = settings.sm_conf.get_opposite_site(site) if flips_site else site
                    logging.debug(f"{svc}: {flow_cmds[0]} on {actual_site}")
                else:
                    # DR command in an unrestricted flow: print the whole (site, cmd) sequence
                    steps = sm_dict.get_dr_operation_sequence(svc, cmd, site)
                    operation = " -> ".join(f"{step[1]} on {step[0]}" for step in steps)
                    logging.debug(f"{svc}: {operation}")
                # mark svc as done so the sorter can release the next stage of this flow
                sorter.done(svc)
    logging.debug("------ Done ------")
def run(services: list = None, cmd="", site=""):
    """Business logic - implements the main flow of a single sm-client command.

    Steps: collect the cluster state, build the per-module service order,
    validate the requested operation, log the planned order, then execute the
    branch that matches ``cmd``.

    @param list services: services requested on the command line (passed through to ordering and validation)
    @param str cmd: command name; compared exactly against "status", "list",
                    settings.site_cmds and settings.dr_procedures
    @param site: site name the command applies to (falsy when no site was given)
    @return: True when the command finished; every failure path ends the
             process with sys.exit(1), a dry run ends it with sys.exit(0)
    """
    # init SMClusterState object and get the status of the sites
    sm_dict = sm_get_cluster_state(None)

    # assemble ordered service list to proceed, keeping in services specified in cli;
    # for stop/move no particular site is passed to the ordering step
    site_to_order = site if cmd not in ["stop", "move"] else None
    for mod_i in settings.sm_conf.get_modules():
        service_dep_ordered, return_code, ts = make_ordered_services_to_process(
            sm_dict, site_to_order, services_to_process=services, module=mod_i)

        # if can't order all sites services for failover, run it for opposite site
        if ts is None and cmd == "stop":
            opposite_site = settings.sm_conf.get_opposite_site(site)
            logging.warning(f"Module: {mod_i}, can't make services order for available site, "
                            f"trying to make order for site {opposite_site}...")
            service_dep_ordered, return_code, ts = make_ordered_services_to_process(
                sm_dict, opposite_site, services_to_process=services, module=mod_i)

        if ts:
            logging.info(f"Module: {mod_i}, Service order creation finished successfully")
        elif return_code:
            logging.info(f"Module: {mod_i}, Service order creation finished, no running services have desired module")
        else:
            logging.error(f"Module: {mod_i}, Service order creation failed")

        # keep the ordering results per module; a falsy return_code is recorded as a dependency issue
        sm_dict.globals[mod_i]['service_dep_ordered'] = service_dep_ordered
        sm_dict.globals[mod_i]['ts'] = ts
        sm_dict.globals[mod_i]['deps_issue'] = not return_code
        logging.debug(f"Module:{mod_i} list:{sm_dict.globals[mod_i]['service_dep_ordered']} "
                      f"deps_issue:{sm_dict.globals[mod_i]['deps_issue']}")

    # validation to satisfy cmd and current site status
    service_dep_ordered = []
    try:
        for mod_i in settings.sm_conf.get_modules():
            services_list = validate_operation(sm_dict, cmd, site, services, mod_i)
            service_dep_ordered.extend(services_list)
            logging.debug(f"Service order {service_dep_ordered}")
    except NotValid:
        sys.exit(1)

    print_service_order(sm_dict, cmd, site)

    if settings.dry_run:  # Check if it's a dry run
        logging.info("Dry run mode enabled. Operation will not be executed.")
        sys.exit(0)  # Exit with success status

    settings.ignored_services.extend(sm_dict.make_ignored_services(service_dep_ordered))

    # main flow by command.
    # Exact comparison on purpose: `cmd in "status"` is a substring test, so an
    # empty or partial cmd (including the default "") used to select this branch.
    if cmd == "status":
        run_status_procedure(sm_dict, service_dep_ordered)
        print_main_table(sm_dict, service_dep_ordered, [site] if site else list(settings.sm_conf.keys()))
    elif cmd == "list":
        print("---------------------------------------------------------------------\n" +
              f"Sites managed by site-manager: {list(sm_dict.keys())}\n\n" +
              f"Kubernetes services managed by site-manager: {sm_dict.get_services_list_for_ok_site()}\n" +
              f"Kubernetes services that will be processed: {service_dep_ordered}\n" +
              "---------------------------------------------------------------------")
    elif cmd in settings.site_cmds + settings.dr_procedures:  # per site command or DR procedure
        print_service_operation_summary("top", sm_dict, service_dep_ordered, cmd, site)
        run_dr_or_site_procedure(sm_dict, cmd, site)
        print_service_operation_summary("tail", sm_dict, service_dep_ordered, cmd, site)
    else:
        logging.error(f"Unknown combination of {cmd} {site} options")
        sys.exit(1)

    if settings.failed_services:
        logging.fatal(f"Some services finished {cmd} with failed status")
        sys.exit(1)
    return True
def print_service_operation_summary(part, sm_dict: SMClusterState, services_to_run: list, cmd="", site=""):
    """Log one section of the operation summary at info level.

    @param str part: "top" logs the header (procedure, sites, services) before the
                     operation; "tail" logs the result lists after it; any other
                     value logs nothing
    @param sm_dict: cluster state, used to list the services managed by site-manager
    @param list services_to_run: services that will be processed
    @param str cmd: procedure name
    @param str site: site the procedure was requested for
    """
    separator = "---------------------------------------------------------------------"
    # Exact comparison on purpose: `part in "top"` is a substring test, so "",
    # "t" or "o" used to be accepted as "top".
    if part == "top":
        logging.info(separator)
        logging.info(f"Procedure: {cmd}")
        active_site = settings.sm_conf.get_active_site(cmd, site)
        logging.info(f"Active sites: {active_site}")
        logging.info(f"Standby sites: {settings.sm_conf.get_opposite_site(active_site)}")
        logging.info(f"Kubernetes services managed by site-manager: {sm_dict.get_services_list_for_ok_site()}")
        logging.info(f"Kubernetes services that will be processed: {services_to_run}")
        logging.info(separator)
    elif part == "tail":
        logging.info(separator)
        logging.info("Summary:")
        # dict.fromkeys drops duplicates while keeping the first-seen order
        logging.info(f"services that successfully done: {list(dict.fromkeys(settings.done_services))}")
        logging.info(f"services that failed: {settings.failed_services}")
        logging.info(f"services that warned: {settings.warned_services}")
        logging.info(f"services that skipped due to dependency: {settings.skipped_due_deps_services}")
        logging.info(f"services that ignored: {settings.ignored_services}")
        logging.info(separator)
def print_main_table(sm_dict: SMClusterState, services_to_run, sites_name: list):
    """ Print the status table: one row per service, one column per site
    @param dict sm_dict: the results of the procedure received from the site-manager
    @param list services_to_run: the list of services that have been processed
    @param list sites_name: list of cluster names
    """
    def make_table(header, sites_name):
        """
        Create the table skeleton: field names plus the two leading header rows
        """
        table = PrettyTable()
        table.field_names = ["Service"] + [f"{site}" for site in sites_name]
        table.align["Service"] = "l"
        # Additional header: column legend and a separator line under it
        table.add_row([""] + [header for _ in sites_name])
        table.add_row(["--------------------------"] +
                      ["------------------------------------" for _ in sites_name])
        table.max_width = 50
        return table

    pt = make_table("mode | DR status | healthz | message", sites_name)
    for service_item in services_to_run:
        row = [service_item]
        for site in sites_name:
            # a cell is filled only when the site has a status and the service reported one
            svc_status = None
            if sm_dict[site]['status']:
                svc_entry = sm_dict[site]['services'].get(service_item)
                if svc_entry:
                    svc_status = svc_entry.get('status')
            if svc_status:
                row.append(
                    f"{svc_status['mode']} / "
                    f"{svc_status['status']} / "
                    f"{svc_status['healthz']} / "
                    f"{svc_status['message']}")
            else:
                row.append("-- / -- / -- /")
        pt.add_row(row)
    print(pt)
def parse_command_line(command_args) -> argparse.Namespace:
    """ Main argument parser
    @param command_args: list of argument strings; None makes argparse read sys.argv
    @return: namespace holding the global options, `command` and, for the
             sub-commands that declare it, `site`
    """
    parser = argparse.ArgumentParser(description=MAIN_HELP_SECTION,
                                     argument_default=argparse.SUPPRESS,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-v', '--verbose', default=False, action='store_true', help='enable the verbosity mode')
    parser.add_argument('-c', '--config', default="", help='define the path to configuration file')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='force apply DR action and ignore healthz')
    parser.add_argument('-k', '--insecure', default=False, action='store_true', help='enable self-signed certificates')
    parser.add_argument('-o', '--output', default="", help='define the filename for logging output')
    parser.add_argument('-r', '--ignore-restrictions', default=False, action='store_true',
                        help='skip state restrictions validation')
    parser.add_argument('--run-services', default='',
                        help='define the list of services to apply DR action, by default all services participate')
    parser.add_argument('--skip-services', default='',
                        help='define the list of services what will not participate in DR action')
    parser.add_argument('--dry-run', default=False, action='store_true',
                        help='perform a dry run without actually executing the operation')

    # `dest` names the sub-command action. Without it, required subparsers make some
    # Python releases raise TypeError instead of a usage error when the sub-command
    # is omitted (bpo-29298).
    subparsers = parser.add_subparsers(dest='command')
    subparsers.required = True

    # (name, help, site_is_optional) - registration order defines the order in --help
    site_commands = (
        ('move', 'move Active functionality to Standby site', False),  # DR switchover
        ('stop', 'excludes site from Active-Standby scheme', False),  # DR failover
        ('return', 'return stopped Kubernetes cluster to Standby role', False),
        ('disable', 'stop Standby kubernetes cluster for maintenance', False),
        ('active', 'set kubernetes cluster services to active mode', False),
        ('standby', 'set kubernetes cluster services to standby mode', False),
        ('list', 'list all services from Active-Standby scheme managed by site-manager with dependencies', True),
        ('status', 'show current status of clusters and all services', True),  # todo to update help
    )
    for name, help_text, site_is_optional in site_commands:
        sub_parser = subparsers.add_parser(name, help=help_text)
        if site_is_optional:
            sub_parser.add_argument('site', nargs='?', default=None, help=SITE_HELP_SECTION)
        else:
            sub_parser.add_argument('site', help=SITE_HELP_SECTION)
        sub_parser.set_defaults(command=name)

    # `version` is the only sub-command without a site argument
    version_parser = subparsers.add_parser('version', help='get current version')
    version_parser.set_defaults(command='version')

    return parser.parse_args(args=command_args)
def main(command_args=None):
    """Main function: parse the command line, initialise settings and run the command.

    Always terminates the process through sys.exit: 0 on success (including the
    `version` command), 1 when configuration initialisation or the command fails.

    @param command_args: argument list handed to parse_command_line (None by default)
    """
    global args
    args = parse_command_line(command_args)
    settings.dry_run = args.dry_run

    # get version command.
    # Exact comparison on purpose: `in "version"` is a substring test and would
    # also accept fragments of the word.
    if args.command == "version":
        with open('./version', 'r', encoding='utf-8') as f:
            print(f"SM-client {f.read()}")
        sys.exit(0)

    if not init_and_check_config(args):
        sys.exit(1)

    settings.ignored_services.extend(settings.skip_services)

    # only services that are not explicitly skipped are handed to run()
    services_to_run = [i for i in settings.run_services if i not in settings.skip_services]
    # sub-commands without a `site` argument pass False, as before
    if not run(services_to_run, args.command, getattr(args, 'site', False)):
        sys.exit(1)
    sys.exit(0)
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()