Skip to content

Commit 31f0da3

Browse files
committed
Improve fabric streams cleanup on error and timeouts
Previously, we performed cleanup only for specific errors such as `ddoc_updated`, and `insufficient_storage`. In case of other errors, or timeouts, there was a chance we would leak workers waiting to be either started or canceled. Those workers would then wait around until the 5 minute rexi timeout fires, and then they emit an error in the logs. It's not a big deal as that happens on errors only, and the processes are all waiting in receive, however, they do hold a Db handle open, so they can waste resources from that point of view. To fix that, this commit extends cleanup to other errors and timeouts. Moreover, in case of timeouts, we log fabric worker timeout errors. In order to do that we export the `fabric_streams` internal `#stream_acc` record to every `fabric_streams` user. That's a bit untidy, so make the timeout error return the defunct workers only, and so, we can avoid leaking the `#stream_acc` record outside the fabric_streams module. Related to #5127
1 parent a2241d3 commit 31f0da3

File tree

7 files changed

+59
-59
lines changed

7 files changed

+59
-59
lines changed

src/couch_replicator/src/couch_replicator_fabric.erl

+2-8
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,8 @@ docs(DbName, Options, QueryArgs, Callback, Acc) ->
3434
after
3535
fabric_streams:cleanup(Workers)
3636
end;
37-
{timeout, NewState} ->
38-
DefunctWorkers = fabric_util:remove_done_workers(
39-
NewState#stream_acc.workers, waiting
40-
),
41-
fabric_util:log_timeout(
42-
DefunctWorkers,
43-
"replicator docs"
44-
),
37+
{timeout, DefunctWorkers} ->
38+
fabric_util:log_timeout(DefunctWorkers, "replicator docs"),
4539
Callback({error, timeout}, Acc);
4640
{error, Error} ->
4741
Callback({error, Error}, Acc)

src/fabric/include/fabric.hrl

-8
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,6 @@
3232
update_seq
3333
}).
3434

35-
-record(stream_acc, {
36-
workers,
37-
ready,
38-
start_fun,
39-
replacements,
40-
ring_opts
41-
}).
42-
4335
-record(view_row, {key, id, value, doc, worker}).
4436

4537
-type row_property_key() :: id | key | value | doc | worker.

src/fabric/src/fabric_streams.erl

+49-6
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,16 @@
2323
add_worker_to_cleaner/2
2424
]).
2525

26-
-include_lib("fabric/include/fabric.hrl").
2726
-include_lib("mem3/include/mem3.hrl").
2827

28+
-record(stream_acc, {
29+
workers,
30+
ready,
31+
start_fun,
32+
replacements,
33+
ring_opts
34+
}).
35+
2936
-define(WORKER_CLEANER, fabric_worker_cleaner).
3037

3138
% This is the streams equivalent of fabric_util:submit_jobs/4. Besides
@@ -77,7 +84,12 @@ start(Workers0, Keypos, StartFun, Replacements, RingOpts) ->
7784
Workers
7885
),
7986
{ok, AckedWorkers};
87+
{timeout, #stream_acc{workers = Defunct}} ->
88+
cleanup(Workers0),
89+
DefunctWorkers = fabric_util:remove_done_workers(Defunct, waiting),
90+
{timeout, DefunctWorkers};
8091
Else ->
92+
cleanup(Workers0),
8193
Else
8294
end.
8395

@@ -165,10 +177,7 @@ handle_stream_start(rexi_STREAM_INIT, {Worker, From}, St) ->
165177
{stop, St#stream_acc{workers = [], ready = Ready1}}
166178
end
167179
end;
168-
handle_stream_start({ok, Error}, _, St) when Error == ddoc_updated; Error == insufficient_storage ->
169-
WaitingWorkers = [W || {W, _} <- St#stream_acc.workers],
170-
ReadyWorkers = [W || {W, _} <- St#stream_acc.ready],
171-
cleanup(WaitingWorkers ++ ReadyWorkers),
180+
handle_stream_start({ok, Error}, _, _) when Error == ddoc_updated; Error == insufficient_storage ->
172181
{stop, Error};
173182
handle_stream_start(Else, _, _) ->
174183
exit({invalid_stream_start, Else}).
@@ -236,7 +245,9 @@ worker_cleaner_test_() ->
236245
?TDEF_FE(should_clean_additional_worker_too),
237246
?TDEF_FE(coordinator_is_killed_if_client_disconnects),
238247
?TDEF_FE(coordinator_is_not_killed_if_client_is_connected),
239-
?TDEF_FE(submit_jobs_sets_up_cleaner)
248+
?TDEF_FE(submit_jobs_sets_up_cleaner),
249+
?TDEF_FE(cleanup_called_on_timeout),
250+
?TDEF_FE(cleanup_called_on_error)
240251
]
241252
}
242253
}.
@@ -442,7 +453,39 @@ submit_jobs_sets_up_cleaner(_) ->
442453
?assert(is_process_alive(Cleaner))
443454
end.
444455

456+
cleanup_called_on_timeout(_) ->
457+
Ref1 = make_ref(),
458+
Ref2 = make_ref(),
459+
W1 = #shard{node = 'n1', ref = Ref1},
460+
W2 = #shard{node = 'n2', ref = Ref2},
461+
Workers = [W1, W2],
462+
meck:expect(rexi_utils, recv, fun(_, _, _, Acc, _, _) ->
463+
{timeout, Acc#stream_acc{workers = [{W2, waiting}]}}
464+
end),
465+
meck:reset(fabric_util),
466+
Res = start(Workers, #shard.ref, undefined, undefined, []),
467+
?assertEqual({timeout, [W2]}, Res),
468+
?assert(meck:called(fabric_util, cleanup, 1)).
469+
470+
cleanup_called_on_error(_) ->
471+
Ref1 = make_ref(),
472+
Ref2 = make_ref(),
473+
W1 = #shard{node = 'n1', ref = Ref1},
474+
W2 = #shard{node = 'n2', ref = Ref2},
475+
Workers = [W1, W2],
476+
meck:expect(rexi_utils, recv, fun(_, _, _, _, _, _) ->
477+
{error, foo}
478+
end),
479+
meck:reset(fabric_util),
480+
Res = start(Workers, #shard.ref, undefined, undefined, []),
481+
?assertEqual({error, foo}, Res),
482+
?assert(meck:called(fabric_util, cleanup, 1)).
483+
445484
setup() ->
485+
ok = meck:new(rexi_utils, [passthrough]),
486+
ok = meck:new(config, [passthrough]),
487+
ok = meck:new(fabric_util, [passthrough]),
488+
meck:expect(config, get, fun(_, _, Default) -> Default end),
446489
ok = meck:expect(rexi, kill_all, fun(_) -> ok end),
447490
% Speed up disconnect socket timeout for the test to 200 msec
448491
ok = meck:expect(chttpd_util, mochiweb_client_req_check_msec, 0, 200).

src/fabric/src/fabric_view_all_docs.erl

+2-8
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,8 @@ go(Db, Options, #mrargs{keys = undefined} = QueryArgs, Callback, Acc) ->
3737
after
3838
fabric_streams:cleanup(Workers)
3939
end;
40-
{timeout, NewState} ->
41-
DefunctWorkers = fabric_util:remove_done_workers(
42-
NewState#stream_acc.workers, waiting
43-
),
44-
fabric_util:log_timeout(
45-
DefunctWorkers,
46-
"all_docs"
47-
),
40+
{timeout, DefunctWorkers} ->
41+
fabric_util:log_timeout(DefunctWorkers, "all_docs"),
4842
Callback({error, timeout}, Acc);
4943
{error, Error} ->
5044
Callback({error, Error}, Acc)

src/fabric/src/fabric_view_changes.erl

+2-9
Original file line numberDiff line numberDiff line change
@@ -199,15 +199,8 @@ send_changes(DbName, ChangesArgs, Callback, PackedSeqs, AccIn, Timeout) ->
199199
after
200200
fabric_streams:cleanup(Workers)
201201
end;
202-
{timeout, NewState} ->
203-
DefunctWorkers = fabric_util:remove_done_workers(
204-
NewState#stream_acc.workers,
205-
waiting
206-
),
207-
fabric_util:log_timeout(
208-
DefunctWorkers,
209-
"changes"
210-
),
202+
{timeout, DefunctWorkers} ->
203+
fabric_util:log_timeout(DefunctWorkers, "changes"),
211204
throw({error, timeout});
212205
{error, Reason} ->
213206
throw({error, Reason});

src/fabric/src/fabric_view_map.erl

+2-10
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
-include_lib("fabric/include/fabric.hrl").
1818
-include_lib("mem3/include/mem3.hrl").
19-
-include_lib("couch/include/couch_db.hrl").
2019
-include_lib("couch_mrview/include/couch_mrview.hrl").
2120

2221
go(DbName, Options, GroupId, View, Args, Callback, Acc, VInfo) when
@@ -66,15 +65,8 @@ go(Db, Options, DDoc, View, Args0, Callback, Acc, VInfo) ->
6665
after
6766
fabric_streams:cleanup(Workers)
6867
end;
69-
{timeout, NewState} ->
70-
DefunctWorkers = fabric_util:remove_done_workers(
71-
NewState#stream_acc.workers,
72-
waiting
73-
),
74-
fabric_util:log_timeout(
75-
DefunctWorkers,
76-
"map_view"
77-
),
68+
{timeout, DefunctWorkers} ->
69+
fabric_util:log_timeout(DefunctWorkers, "map_view"),
7870
Callback({error, timeout}, Acc);
7971
{error, Error} ->
8072
Callback({error, Error}, Acc)

src/fabric/src/fabric_view_reduce.erl

+2-10
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
-include_lib("fabric/include/fabric.hrl").
1818
-include_lib("mem3/include/mem3.hrl").
19-
-include_lib("couch/include/couch_db.hrl").
2019
-include_lib("couch_mrview/include/couch_mrview.hrl").
2120

2221
go(DbName, GroupId, View, Args, Callback, Acc0, VInfo) when is_binary(GroupId) ->
@@ -55,15 +54,8 @@ go(Db, DDoc, VName, Args, Callback, Acc, VInfo) ->
5554
after
5655
fabric_streams:cleanup(Workers)
5756
end;
58-
{timeout, NewState} ->
59-
DefunctWorkers = fabric_util:remove_done_workers(
60-
NewState#stream_acc.workers,
61-
waiting
62-
),
63-
fabric_util:log_timeout(
64-
DefunctWorkers,
65-
"reduce_view"
66-
),
57+
{timeout, DefunctWorkers} ->
58+
fabric_util:log_timeout(DefunctWorkers, "reduce_view"),
6759
Callback({error, timeout}, Acc);
6860
{error, Error} ->
6961
Callback({error, Error}, Acc)

0 commit comments

Comments
 (0)