Skip to content

Commit ae866c7

Browse files
authored
xapi_ha: avoid raising Not_found when joining a liveset (#6734)
An xcp-ng user reported a failure when enabling HA, with the only error being a Not_found, with this change we now know that it's because the IP of the coordinator is not present in the local database's `ha_peers` value. While doing this: - I also removed several finds and hd to reduce these kind of exceptions and make the problematic uses easier to find in the future - The localdb interface was rebustified by using options - the cluster_stack values were reified and consolidated into constants, they are problematic because they are strings instead of an enum, but we can tame them a bit
2 parents 42e47b6 + 17e0edd commit ae866c7

20 files changed

+246
-261
lines changed

ocaml/sdk-gen/c/gen_c_binding.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ and gen_impl cls =
346346
; ("async_params", `A (List.map paramJson (asyncParams msg)))
347347
; ("msg_params", `A (List.map paramJson msg.msg_params))
348348
; ("abstract_result_type", `String (result_type msg))
349-
; ("has_params", `Bool (List.length msg.msg_params <> 0))
349+
; ("has_params", `Bool (msg.msg_params <> []))
350350
; ("param_count", `String (string_of_int (List.length msg.msg_params)))
351351
; ("has_result", `Bool (String.compare (result_type msg) "" <> 0))
352352
; ("init_result", `Bool (init_result msg))

ocaml/xapi-consts/constants.ml

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -212,12 +212,29 @@ let ballooning_enabled = "ballooning.enabled"
212212
let redo_log_enabled = "redo_log.enabled"
213213

214214
(* Valid cluster stack values *)
215-
let ha_cluster_stack = "ha_cluster_stack"
215+
module Ha_cluster_stack = struct
216+
type t = Xhad | Corosync
216217

217-
let default_smapiv3_cluster_stack = "corosync"
218+
let key = "ha_cluster_stack"
218219

219-
(* Note: default without clustering is in !Xapi_globs.default_cluster_stack *)
220-
let supported_smapiv3_cluster_stacks = ["corosync"]
220+
let to_string = function Xhad -> "xhad" | Corosync -> "corosync"
221+
222+
let of_string = function
223+
| "xhad" ->
224+
Some Xhad
225+
| "corosync" ->
226+
Some Corosync
227+
| _ ->
228+
None
229+
end
230+
231+
let ha_cluster_stack = Ha_cluster_stack.key
232+
233+
let default_cluster_stack = Ha_cluster_stack.(to_string Xhad)
234+
235+
let default_smapiv3_cluster_stack = Ha_cluster_stack.(to_string Corosync)
236+
237+
let supported_smapiv3_cluster_stacks = [default_smapiv3_cluster_stack]
221238

222239
(* Set in the local db to cause us to emit an alert when we come up as a master after
223240
a transition or HA failover *)

ocaml/xapi/create_storage.ml

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -79,24 +79,20 @@ let maybe_create_pbd rpc session_id sr device_config me =
7979
(fun self -> Client.PBD.get_host ~rpc ~session_id ~self = me)
8080
pbds
8181
in
82-
(* Check not more than 1 pbd in the database *)
83-
let pbds =
84-
if List.length pbds > 1 then (
85-
(* shouldn't happen... delete all but first pbd to make db consistent again *)
86-
List.iter
87-
(fun pbd -> Client.PBD.destroy ~rpc ~session_id ~self:pbd)
88-
(List.tl pbds) ;
89-
[List.hd pbds]
90-
) else
91-
pbds
92-
in
93-
if pbds = [] (* If there's no PBD, create it *) then
82+
let create () : [`PBD] Ref.t =
9483
Client.PBD.create ~rpc ~session_id ~host:me ~sR:sr ~device_config
9584
~other_config:[]
96-
else
97-
List.hd pbds
98-
99-
(* Otherwise, return the current one *)
85+
in
86+
(* Ensure there's a single PBD *)
87+
match pbds with
88+
| [] ->
89+
ignore (create ())
90+
| [_] ->
91+
()
92+
| _ :: pbds ->
93+
(* shouldn't happen... delete all but first pbd to make db consistent
94+
again *)
95+
List.iter (fun pbd -> Client.PBD.destroy ~rpc ~session_id ~self:pbd) pbds
10096

10197
let maybe_remove_tools_sr rpc session_id __context =
10298
let srs = Db.SR.get_all ~__context in
@@ -153,17 +149,13 @@ let initialise_storage (me : API.ref_host) rpc session_id __context : unit =
153149
List.filter (fun (_, pbd_rec) -> pbd_rec.API.pBD_host = master) pbds
154150
in
155151
let maybe_create_pbd_for_shared_sr s =
156-
let _, mpbd_rec =
157-
List.find (fun (_, pbdr) -> pbdr.API.pBD_SR = s) master_pbds
158-
in
159-
let master_devconf = mpbd_rec.API.pBD_device_config in
160-
let my_devconf = List.remove_assoc "SRmaster" master_devconf in
161-
(* this should never be used *)
162-
maybe_create_pbd rpc session_id s my_devconf me
152+
List.find_opt (fun (_, pbdr) -> pbdr.API.pBD_SR = s) master_pbds
153+
|> Option.iter @@ fun (_, mpbd_rec) ->
154+
let master_devconf = mpbd_rec.API.pBD_device_config in
155+
let my_devconf = List.remove_assoc "SRmaster" master_devconf in
156+
try maybe_create_pbd rpc session_id s my_devconf me with _ -> ()
163157
in
164-
List.iter
165-
(fun s -> try ignore (maybe_create_pbd_for_shared_sr s) with _ -> ())
166-
shared_sr_refs
158+
List.iter maybe_create_pbd_for_shared_sr shared_sr_refs
167159
in
168160
let other_config =
169161
try
@@ -173,9 +165,8 @@ let initialise_storage (me : API.ref_host) rpc session_id __context : unit =
173165
in
174166
if
175167
not
176-
(List.mem_assoc Xapi_globs.sync_create_pbds other_config
177-
&& List.assoc Xapi_globs.sync_create_pbds other_config
178-
= Xapi_globs.sync_switch_off
168+
(List.assoc_opt Xapi_globs.sync_create_pbds other_config
169+
= Some Xapi_globs.sync_switch_off
179170
)
180171
then (
181172
debug "Creating PBDs for shared SRs" ;

ocaml/xapi/localdb.ml

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,17 +64,27 @@ exception Missing_key of string
6464
let m = Mutex.create ()
6565

6666
let get (key : string) =
67+
let __FUN = __FUNCTION__ in
68+
let ( let* ) = Option.bind in
6769
with_lock m (fun () ->
68-
assert_loaded () ;
69-
match Hashtbl.find_opt db key with
70-
| Some x ->
71-
x
72-
| None ->
73-
raise (Missing_key key)
70+
let* () =
71+
try assert_loaded () ; Some ()
72+
with e ->
73+
warn "%s: unexpected error, ignoring it: %s" __FUN
74+
(Printexc.to_string e) ;
75+
None
76+
in
77+
Hashtbl.find_opt db key
7478
)
7579

76-
let get_with_default (key : string) (default : string) =
77-
try get key with Missing_key _ -> default
80+
let get_exn key =
81+
match get key with Some x -> x | None -> raise (Missing_key key)
82+
83+
let get_of_string of_string key = Option.bind (get key) of_string
84+
85+
let get_bool key = get_of_string bool_of_string_opt key
86+
87+
let get_int key = get_of_string int_of_string_opt key
7888

7989
(* Returns true if a change was made and should be flushed *)
8090
let put_one (key : string) (v : string) =

ocaml/xapi/localdb.mli

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,24 @@
1818
(** Thrown when a particular named key could not be found. *)
1919
exception Missing_key of string
2020

21-
val get : string -> string
21+
val get : string -> string option
2222
(** Retrieves a value *)
2323

24-
val get_with_default : string -> string -> string
25-
(** [get_with_default key default] returns the value associated with [key],
26-
or [default] if the key is missing. *)
24+
val get_exn : string -> string
25+
(** Retrieves the value for the key, raises Missing_key when the key is not
26+
present *)
27+
28+
val get_bool : string -> bool option
29+
(** Retrieves the value for the key, returns a value when it's found and is a
30+
valid boolean, otherwise is [None] *)
31+
32+
val get_int : string -> int option
33+
(** Retrieves the value for the key, returns a value when it's found and is a
34+
valid int, otherwise is [None] *)
35+
36+
val get_of_string : (string -> 'a option) -> string -> 'a option
37+
(** [get_of_string of_string key] retrieves the value for [key], and if it
38+
exists, processes it with [of_string], otherwise it's [None] *)
2739

2840
val put : string -> string -> unit
2941
(** Inserts a value into the database, only returns when the insertion has

ocaml/xapi/repository.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ let sync ~__context ~self ~token ~token_id ~username ~password =
232232
Xapi_stdext_pervasives.Pervasiveext.finally
233233
(fun () ->
234234
let config_repo config =
235-
if List.length config <> 0 then (* Set params to yum/dnf *)
235+
if config <> [] then (* Set params to yum/dnf *)
236236
let Pkg_mgr.{cmd; params} = Pkgs.config_repo ~repo_name ~config in
237237
ignore
238238
(Helpers.call_script ~log_output:Helpers.On_failure cmd params)

ocaml/xapi/system_domains.ml

Lines changed: 0 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -148,36 +148,6 @@ let is_in_use ~__context ~self =
148148
else
149149
false
150150

151-
(* [wait_for ?timeout f] returns true if [f()] (called at 1Hz) returns true within
152-
the [timeout] period and false otherwise *)
153-
let wait_for ?(timeout = 120.) f =
154-
let start = Unix.gettimeofday () in
155-
let finished = ref false in
156-
let success = ref false in
157-
while not !finished do
158-
let remaining = timeout -. (Unix.gettimeofday () -. start) in
159-
if remaining < 0. then
160-
finished := true
161-
else
162-
try
163-
if f () then (
164-
success := true ;
165-
finished := true
166-
) else
167-
Thread.delay 1.
168-
with _ -> Thread.delay 1.
169-
done ;
170-
!success
171-
172-
let pingable ip () =
173-
try
174-
let (_ : string * string) =
175-
Forkhelpers.execute_command_get_output "/bin/ping"
176-
["-c"; "1"; "-w"; "1"; ip]
177-
in
178-
true
179-
with _ -> false
180-
181151
let queryable ~__context transport () =
182152
let open Xmlrpc_client in
183153
let tracing = Context.set_client_span __context in
@@ -197,47 +167,6 @@ let queryable ~__context transport () =
197167
(Printexc.to_string e) ;
198168
false
199169

200-
let ip_of ~__context driver =
201-
(* Find the VIF on the Host internal management network *)
202-
let vifs = Db.VM.get_VIFs ~__context ~self:driver in
203-
let hin = Helpers.get_host_internal_management_network ~__context in
204-
let ip =
205-
let vif =
206-
try
207-
List.find
208-
(fun vif -> Db.VIF.get_network ~__context ~self:vif = hin)
209-
vifs
210-
with Not_found ->
211-
failwith
212-
(Printf.sprintf
213-
"driver domain %s has no VIF on host internal management network"
214-
(Ref.string_of driver)
215-
)
216-
in
217-
match Xapi_udhcpd.get_ip ~__context vif with
218-
| Some (a, b, c, d) ->
219-
Printf.sprintf "%d.%d.%d.%d" a b c d
220-
| None ->
221-
failwith
222-
(Printf.sprintf
223-
"driver domain %s has no IP on the host internal management \
224-
network"
225-
(Ref.string_of driver)
226-
)
227-
in
228-
info "driver domain uuid:%s ip:%s" (Db.VM.get_uuid ~__context ~self:driver) ip ;
229-
if not (wait_for (pingable ip)) then
230-
failwith
231-
(Printf.sprintf "driver domain %s is not responding to IP ping"
232-
(Ref.string_of driver)
233-
) ;
234-
if not (wait_for (queryable ~__context (Xmlrpc_client.TCP (ip, 80)))) then
235-
failwith
236-
(Printf.sprintf "driver domain %s is not responding to XMLRPC query"
237-
(Ref.string_of driver)
238-
) ;
239-
ip
240-
241170
type service = {uuid: string; ty: string; instance: string; url: string}
242171
[@@deriving rpc]
243172

ocaml/xapi/system_domains.mli

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,6 @@ val is_in_use : __context:Context.t -> self:API.ref_VM -> bool
5151
val queryable : __context:Context.t -> Xmlrpc_client.transport -> unit -> bool
5252
(** [queryable ip port ()] returns true if [ip]:[port] responsds to an XMLRPC query *)
5353

54-
val ip_of : __context:Context.t -> API.ref_VM -> string
55-
(** [ip_of __context vm] returns the IP of the given VM on the internal management network *)
56-
5754
(** One of many service running in a driver domain *)
5855
type service = {uuid: string; ty: string; instance: string; url: string}
5956

ocaml/xapi/xapi.ml

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -288,12 +288,9 @@ let synchronize_certificates_with_coordinator ~__context =
288288

289289
(* Make sure the local database can be read *)
290290
let init_local_database () =
291-
( try
292-
let (_ : string) = Localdb.get Constants.ha_armed in
293-
()
294-
with Localdb.Missing_key _ ->
295-
Localdb.put Constants.ha_armed "false" ;
296-
debug "%s = 'false' (by default)" Constants.ha_armed
291+
if Option.is_none (Localdb.get_bool Constants.ha_armed) then (
292+
Localdb.put Constants.ha_armed "false" ;
293+
debug "%s = 'false' (by default)" Constants.ha_armed
297294
) ;
298295
(* Add the local session check hook *)
299296
Session_check.check_local_session_hook :=
@@ -519,13 +516,14 @@ let start_ha () =
519516
(** Enable and load the redo log if we are the master, the local-DB flag is set
520517
* and HA is disabled *)
521518
let start_redo_log () =
519+
let redo_log_enabled () =
520+
Localdb.get_bool Constants.redo_log_enabled |> Option.value ~default:false
521+
in
522+
let ha_armed () =
523+
Localdb.get_bool Constants.ha_armed |> Option.value ~default:false
524+
in
522525
try
523-
if
524-
Pool_role.is_master ()
525-
&& bool_of_string
526-
(Localdb.get_with_default Constants.redo_log_enabled "false")
527-
&& not (bool_of_string (Localdb.get Constants.ha_armed))
528-
then (
526+
if Pool_role.is_master () && redo_log_enabled () && not (ha_armed ()) then (
529527
debug "Redo log was enabled when shutting down, so restarting it" ;
530528
Static_vdis.reattempt_on_boot_attach () ;
531529
(* enable the use of the redo log *)
@@ -610,7 +608,7 @@ let resynchronise_ha_state () =
610608
let pool = Helpers.get_pool ~__context in
611609
let pool_ha_enabled = Db.Pool.get_ha_enabled ~__context ~self:pool in
612610
let local_ha_enabled =
613-
bool_of_string (Localdb.get Constants.ha_armed)
611+
Localdb.get_bool Constants.ha_armed |> Option.value ~default:false
614612
in
615613
match (local_ha_enabled, pool_ha_enabled) with
616614
| true, true ->

ocaml/xapi/xapi_globs.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -858,7 +858,7 @@ let migration_https_only = ref true
858858

859859
let cluster_stack_root = ref "/usr/libexec/xapi/cluster-stack"
860860

861-
let cluster_stack_default = ref "xhad"
861+
let cluster_stack_default = ref Constants.default_cluster_stack
862862

863863
let xen_cmdline_path = ref "/opt/xensource/libexec/xen-cmdline"
864864

0 commit comments

Comments
 (0)