Conversation
| parse_cli_args(["oom_monitor_report_period", Period|Rest], C) -> | ||
| try list_to_integer(Period) of | ||
| P when P >= 0 -> | ||
| parse_cli_args(Rest, C#config{ oom_monitor_report_period = P }); |
There was a problem hiding this comment.
Does this need a P * 1000?
|
|
||
| init([]) -> | ||
| ?LOG_INFO([{event, ar_oom_monitor_init}]), | ||
| %% Trap exit to ensure we close the file handle cleanly |
There was a problem hiding this comment.
Do we still need this? or is this boilerplate copied from other gen_servers?
We've been wrestling with the process_flag(trap_exist, true) as it was one of the contributors to the shutdown hang. I thing we had previously gotten into the habit of just adding it to all new processes by default (e.g. by copying it from an existing process). Just want to call it out here in case that's tha case. If the trap_exit is needed and not just boilerplate: then all good.
| case file:open(Filename, [write, raw, {encoding, utf8}]) of | ||
| {ok, FileHandle} -> | ||
| ?LOG_INFO([ | ||
| {event, ar_oom_monitor_started}, {report_period_ms, ReportPeriod}, {filename, Filename}]), |
There was a problem hiding this comment.
| {event, ar_oom_monitor_started}, {report_period_ms, ReportPeriod}, {filename, Filename}]), | |
| <tab><tab>{event, ar_oom_monitor_started}, {report_period_ms, ReportPeriod}, {filename, Filename}]), |
maybe? this line shows up as 8 spaces in my diff rather than 2 tabs
| Handle -> | ||
| file:close(Handle) | ||
| end, | ||
| ok. |
There was a problem hiding this comment.
Do we always want to return ok here even if file:close yields an error?
| get_top_binary_processes(TopProcs) -> | ||
| Procs = lists:map(fun(Pid) -> | ||
| case erlang:process_info(Pid, [binary, registered_name, initial_call, current_function]) of | ||
| undefined -> {Pid, 0, []}; | ||
| Props -> | ||
| TotalSize = lists:sum([Size || {_, Size, _} <- proplists:get_value(binary, Props, [])]), | ||
| InitialCall = proplists:get_value(initial_call, Props, {unknown, unknown, 0}), | ||
| CurrentFunction = proplists:get_value(current_function, Props, {unknown, unknown, 0}), | ||
| case proplists:get_value(registered_name, Props, undefined) of | ||
| undefined -> {Pid, TotalSize, [{initial_call, InitialCall}, {current_function, CurrentFunction}]}; | ||
| RegName -> {Pid, TotalSize, [RegName, {initial_call, InitialCall}, {current_function, CurrentFunction}]} | ||
| end | ||
| end | ||
| end, erlang:processes()), |
There was a problem hiding this comment.
Some spaces in here instead of tabs (haha, the new ChatGPT tell)
| AllocData = recon_alloc:memory(allocated_types), | ||
| maps:from_list([{Type, Size} || {Type, Size} <- AllocData]). | ||
|
|
||
| write_memory_report(FileHandle, MemoryReport, LastMemoryReport) -> |
This PR introduces the OOM monitor process.
In order to run it, you need to enable
oom_monitorflag in config:{ "enable": ["oom_monitor"] }Related configuration options are:
oom_monitor_report_period: time interval between reports in seconds; default is 5;oom_monitor_filename: filename to write reports into; default is/tmp/arweave_oom_monitor.log;oom_monitor_top_procs: number of top processes to sample for each category of the report; default is 20.Sample report: