From 8049b855c482f061a001274e1489d71154cc158f Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Tue, 19 Nov 2019 13:08:57 -0700 Subject: [PATCH 01/43] stbtrace can't find input files for package installations, collection (#12) (#13) interval in seconds --- cmd/stbtrace.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cmd/stbtrace.py b/cmd/stbtrace.py index b4b7b40..b230351 100755 --- a/cmd/stbtrace.py +++ b/cmd/stbtrace.py @@ -48,8 +48,8 @@ help='A comma separated list of statistics to' 'collect') parser.add_argument('-c', '--coll', type=int, action='store', - dest='collection_ns', - help='The collection interval specified in ns') + dest='collection_sec', + help='The collection interval in seconds') parser.add_argument('-b', '--bcc', action='store_true', dest='bcc', help='Emit the bcc script without executing it') parser.add_argument('-p', '--path', action='store', dest='PATH', @@ -62,7 +62,7 @@ if args.PATH: filename = args.PATH + '/' + args.tracer + '.json' else: - filename = '../bpf/stbtrace/' + args.tracer + '.json' + filename = base_dir + '/bpf/stbtrace/' + args.tracer + '.json' try: with open(filename, 'r') as json_file: data = json.load(json_file) @@ -91,7 +91,7 @@ if args.PATH: filename = args.PATH + '/' + args.tracer + '.st' else: - filename = '../bpf/stbtrace/' + args.tracer + '.st' + filename = base_dir + '/bpf/stbtrace/' + args.tracer + '.st' try: with open(filename, 'r') as f: @@ -118,8 +118,9 @@ template.selectFields('maps', args.STATS.split(",")) template.selectFields('hists', args.STATS.split(",")) -if args.collection_ns: - template.addSingleton('collection_period_in_ns', args.collection_ns) +if args.collection_sec: + template.addSingleton('collection_period_in_ns', + args.collection_sec * 1000000000) else: for node in data: if node != 'keys' and \ From 247f1cc34ec0c186c59da7ca75e20237e1a9763c Mon Sep 17 00:00:00 2001 From: Prashanth Sreenivasa Date: Fri, 6 Dec 2019 08:35:30 -0800 Subject: [PATCH 02/43] Fix backend io collector post 5.x kernel upgrade (#18) (#20) * Fix backend io collector post 5.x kernel upgrade * Check for kprobe existence before attaching * Fix lint 1 --- bpf/stbtrace/io.st | 3 ++- cmd/estat.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/bpf/stbtrace/io.st b/bpf/stbtrace/io.st index 01ea4c5..8ad836d 100755 --- a/bpf/stbtrace/io.st +++ b/bpf/stbtrace/io.st @@ -128,7 +128,8 @@ int disk_io_done(struct pt_regs *ctx, struct request *reqp) """ b = BPF(text=bpf_text) -b.attach_kprobe(event="blk_start_request", fn_name="disk_io_start") +if BPF.get_kprobe_functions(b'blk_start_request'): + b.attach_kprobe(event="blk_start_request", fn_name="disk_io_start") b.attach_kprobe(event="blk_mq_start_request", fn_name="disk_io_start") b.attach_kprobe(event="blk_account_io_completion", fn_name="disk_io_done") diff --git a/cmd/estat.py b/cmd/estat.py index ef56665..cd05f61 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -412,8 +412,12 @@ class Args: line + "'") probe_type = probe_spec[0] if probe_type == "kprobe": - b.attach_kprobe(event=probe_spec[1], fn_name=probe_spec[2]) - probes.add("p_" + probe_spec[1] + "_bcc_" + str(os.getpid())) + if BPF.get_kprobe_functions(probe_spec[1]): + b.attach_kprobe(event=probe_spec[1], fn_name=probe_spec[2]) + probes.add("p_" + probe_spec[1] + "_bcc_" + str(os.getpid())) + else: + print("WARNING: {}: {} - not found" + .format(probe_type, probe_spec[1])) elif probe_type == "kretprobe": b.attach_kretprobe(event=probe_spec[1], fn_name=probe_spec[2], maxactive=MAXACTIVE) From 
b8e0e493f89c5c5c4e2c3217ec227a180a79b41b Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Sat, 18 Jan 2020 00:27:22 +0000 Subject: [PATCH 03/43] port txg dtrace script to ebpf closes #30 --- bpf/standalone/txg.py | 378 ++++++++++++++++++++++++++++++++++++++++++ cmd/estat.py | 26 ++- 2 files changed, 396 insertions(+), 8 deletions(-) create mode 100755 bpf/standalone/txg.py diff --git a/bpf/standalone/txg.py b/bpf/standalone/txg.py new file mode 100755 index 0000000..5ca0262 --- /dev/null +++ b/bpf/standalone/txg.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +# +# Copyright 2019 Delphix. All rights reserved. +# +# SPDX-License-Identifier: GPL-2.0-or-later +# +# +# This script prints out information about every spa_sync() for the domain0 +# zpool. This allows us to find a point in the past where I/O performance +# degraded and some high-level symptoms about why it's slower now than it was, +# which may be enough to correlate the change to a configuration / +# environmental change. +# + +from bcc import BPF, PerfType, PerfSWConfig +import argparse +import os +import drgn +from datetime import datetime + +# define BPF program +bpf_text = """ +#include +#include +#include +#include +#include +#include +""" +parser = argparse.ArgumentParser( + description='Collect spa_sync statistics for each txg.', + usage='estat txg [options]') +parser.add_argument('-c', '--coll', type=int, action='store', + dest='collection_sec', + help='The collection interval in seconds') +parser.add_argument('-p', '--pool', type=str, action='store', + dest='pool', + help='The pool to monitor (default: domain0)') +args = parser.parse_args() + +if (args.pool): + bpf_text += '#define POOL "' + str(args.pool) + '"' +else: + bpf_text += '#define POOL "domain0"' + +bpf_text += """ +typedef struct { + spa_t *spa; + u64 start; + u64 stop; + u64 time_since_last_sync; + u64 pool_sync_enter; + u64 elapsed_pass1; + u64 delay_entry_time; + u64 txg; + u64 throttle; + u64 delay_max; + u64 delay_sum; + u64 delay_count; +} spa_sync_info; + +typedef struct { + u64 txg; + u64 time_since_last_sync; + u64 sync_time; + u64 elapsed_pass1; + u64 dirty_b; + u64 throttle; + u64 delay_max; + u64 delay_sum; + u64 delay_count; +} spa_sync_data; + +typedef struct { + u64 *dirty_addr; + u64 dirty_max; +} spa_sync_dirty; + +#define SPA_SYNC_DIRTY_INDEX 0; + +BPF_HASH(sync_info_map, u32, spa_sync_info); +BPF_ARRAY(sync_dirty_map, spa_sync_dirty, 1); +BPF_PERF_OUTPUT(sync_events); + +/* + * Store the spa_t address if the pool name matches POOL, + * "domain0" by default. Should only have to do the string + * comparision once. + */ + +static inline void pool_set(spa_sync_info *info, spa_t *spa) +{ + char comparand[sizeof(POOL)]; + bpf_probe_read(&comparand, sizeof(comparand), spa->spa_name); + char compare[] = POOL; + for (int i = 0; i < sizeof(comparand); ++i) + if (compare[i] != comparand[i]) + return; + info->spa = spa; +} + +/* + * Get the domain0 spa object and store "start" time and + * "time since last sync". 
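+ * Per-thread state lives in sync_info_map, a BPF_HASH keyed by the tid
+ * from bpf_get_current_pid_tgid(), so a concurrent spa_sync() of some
+ * other pool cannot clobber the entry for the pool being traced.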
+ */ +int spa_sync_entry(struct pt_regs *ctx, spa_t *spa, uint64_t txg) +{ + u32 tid = bpf_get_current_pid_tgid(); + spa_sync_info *info = sync_info_map.lookup(&tid); + if (info == NULL) { + spa_sync_info spa_info = {}; + sync_info_map.insert(&tid, &spa_info); + info = &spa_info; + + int dirty_index = SPA_SYNC_DIRTY_INDEX; + spa_sync_dirty *spa_dirty; + spa_dirty = sync_dirty_map.lookup(&dirty_index); + if (spa_dirty != NULL) { + spa_dirty->dirty_addr = &spa->spa_dsl_pool->dp_dirty_total; + spa_dirty->dirty_max = spa->spa_dsl_pool->dp_dirty_total; + } + } + + if (info->spa == 0) + pool_set(info, spa); + + if (info->spa != spa) + return 0; + + info->start = bpf_ktime_get_ns(); + info->txg = txg; + /* + * We may not have a "stop" time yet, so just print a (kind of bogus) + * zero value for "time since last sync" if that's the case. This only + * affects the first txg we see -- after that "stop" will be set by the + * previous txg. + */ + info->time_since_last_sync = info->stop != 0 ? + info->start - info->stop : 0; + info->pool_sync_enter = 0; + info->delay_entry_time = 0; + info->delay_max = 0; + return 0; +} + +/* + * Collect data required to know the percentage of syncing time spent + * in pass 1. + */ +int dsl_pool_sync_entry(struct pt_regs *ctx, dsl_pool_t *dp) +{ + u32 tid = bpf_get_current_pid_tgid(); + spa_sync_info *info = sync_info_map.lookup(&tid); + if (info == NULL || info->spa != dp->dp_spa || + info->spa->spa_sync_pass != 1) { + return 0; + } + + info->pool_sync_enter = bpf_ktime_get_ns(); + return 0; +} + +int dsl_pool_sync_return(struct pt_regs *ctx) +{ + u32 tid = bpf_get_current_pid_tgid(); + spa_sync_info *info = sync_info_map.lookup(&tid); + if (info == NULL || info->pool_sync_enter == 0) { + return 0; + } + + info->elapsed_pass1 = bpf_ktime_get_ns() - info->pool_sync_enter; + return 0; +} + +/* + * Collect data to know how much we're being throttled / delayed. If we're + * throttled on every tx we could hit these probes a lot (burning CPU), so + * processing in the probe should be kept to a minimum. + */ +int dmu_tx_delay_entry(struct pt_regs *ctx, dmu_tx_t *tx, uint64_t dirty) +{ + u32 tid = bpf_get_current_pid_tgid(); + spa_sync_info *info = sync_info_map.lookup(&tid); + if (info == NULL || info->spa != tx->tx_pool->dp_spa) { + return 0; + } + + info->delay_entry_time = bpf_ktime_get_ns(); + return 0; +} +int dmu_tx_delay_mintime(struct pt_regs *ctx, dmu_tx_t *tx, uint64_t dirty, + uint64_t min_tx_time) +{ + u32 tid = bpf_get_current_pid_tgid(); + spa_sync_info *info = sync_info_map.lookup(&tid); + if (info == NULL || info->delay_entry_time == 0) { + return 0; + } + + if (min_tx_time > info->throttle) { + info->throttle = min_tx_time; + } + return 0; +} + +int dmu_tx_delay_return(struct pt_regs *ctx) +{ + u32 tid = bpf_get_current_pid_tgid(); + spa_sync_info *info = sync_info_map.lookup(&tid); + if (info == NULL || info->delay_entry_time == 0) { + return 0; + } + + u64 elapsed = bpf_ktime_get_ns() - info->delay_entry_time; + if (elapsed > info->delay_max) { + info->delay_max = elapsed; + } + + info->delay_count = info->delay_count + 1; + info->delay_sum = info->delay_sum + elapsed; + return 0; +} + +/* + * Submit an event containing the collected stats for each + * completed spa sync and reset counters. 
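+ * The spa_sync_data record submitted here is consumed on the Python
+ * side by print_event() via the "sync_events" perf buffer.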
+ */ +int spa_sync_return(struct pt_regs *ctx) +{ + u32 tid = bpf_get_current_pid_tgid(); + spa_sync_info *info = sync_info_map.lookup(&tid); + if (info == NULL || info->pool_sync_enter == 0) { + return 0; + } + info->stop = bpf_ktime_get_ns(); + + + spa_sync_data data = {}; + data.txg = info->txg; + data.time_since_last_sync = info->time_since_last_sync; + data.sync_time = info->stop - info->start; + data.elapsed_pass1 = info->elapsed_pass1; + + data.throttle = info->throttle; + data.delay_max = info->delay_max; + data.delay_sum = info->delay_sum; + data.delay_count = info->delay_count; + + int dirty_index = SPA_SYNC_DIRTY_INDEX; + spa_sync_dirty *dirty = sync_dirty_map.lookup(&dirty_index); + if (dirty == NULL) { + data.dirty_b = info->spa->spa_dsl_pool->dp_dirty_total; + } else { + data.dirty_b = dirty->dirty_max; + dirty->dirty_max = 0; + } + + sync_events.perf_submit(ctx, &data, sizeof(data)); + info->throttle = 0; + info->start = 0; + info->spa = 0; + + return 0; +} + +/* + * Record how much dirty data we've collected so far. The value doesn't need to + * be exact, so we just check this periodically. + */ +int get_spa_dirty(struct bpf_perf_event_data *ctx) { + u32 tid = bpf_get_current_pid_tgid(); + int dirty_index = SPA_SYNC_DIRTY_INDEX; + spa_sync_dirty *dirty = sync_dirty_map.lookup(&dirty_index); + if (dirty == NULL) { + return 0; + } + + if (*dirty->dirty_addr > dirty->dirty_max) { + dirty->dirty_max = *dirty->dirty_addr; + } + + return 0; +} +""" + + +def read_zfs_dirty_max_data(): + """ Use the drgn program object to read the + zfs_dirty_data_max kernel variable. + """ + global proj + variable = prog['zfs_dirty_data_max'] + return int(variable.value_()) + + +def print_event(cpu, data, size): + """ Print the raw txg data and a legend at the start and + after every 30 events. 
+ """ + event_format = ('{:<24} {:>10} {:>5}ms {:>5}ms ({:>2} pass 1)' + '{:>4}MB ({:>2}) {:>4}us {:>5}ms {:>4}ms') + global print_count + if (print_count == 30): + print(" date txg time" + " since last sync") + print(" | | |" + " sync time") + print(" | | |" + " | (%% pass 1)") + print(" | | |" + " | | highest dirty (%%) ") + print(" | | |" + " | | | highest throttle delay") + print(" | | |" + " | | | | | avg delay") + print(" v v v" + " v v v v v v") + # + # Here is an example line: + # "Fri Jan 17 21:21:16 2020 1827162 0ms 1342ms (60% p1) \ + # 208MB (10%) 0us 57ms 10ms" + # + print_count = 0 + print_count = print_count + 1 + + zfs_dirty_max_data = read_zfs_dirty_max_data() + event = b["sync_events"].event(data) + date = datetime.now() + average_delay = 0 + if (event.delay_count > 1): + average_delay = event.delay_sum / event.delay_count + print(event_format.format(date.ctime(), + event.txg, + int(event.time_since_last_sync / 1000 / 1000), + int(event.sync_time / 1000 / 1000), + int(event.elapsed_pass1 * 100 / event.sync_time), + int(event.dirty_b / 1024 / 1024), + int(event.dirty_b * 100 / zfs_dirty_max_data), + int((event.throttle + 999) / 1000), + event.delay_max, + average_delay)) + + +# load BPF program +KVER = os.popen('uname -r').read().rstrip() +b = BPF(text=bpf_text, + cflags=["-include", + "/usr/src/zfs-" + KVER + "/zfs_config.h", + "-I/usr/src/zfs-" + KVER + "/include/", + "-I/usr/src/zfs-" + KVER + "/include/spl", + "-I/usr/src/zfs-" + KVER + "/include/", + "-I/usr/src/zfs-" + KVER + "/include/linux", + "-DCC_USING_FENTRY"]) + +b.attach_kprobe(event="spa_sync", fn_name="spa_sync_entry") +b.attach_kretprobe(event="spa_sync", fn_name="spa_sync_return") +b.attach_kprobe(event="dsl_pool_sync", fn_name="dsl_pool_sync_entry") +b.attach_kretprobe(event="dsl_pool_sync", fn_name="dsl_pool_sync_return") +b.attach_kprobe(event="dmu_tx_delay", fn_name="dmu_tx_delay_entry") +b.attach_kprobe(event="trace_zfs_delay__mintime", + fn_name="dmu_tx_delay_mintime") +b.attach_perf_event(ev_type=PerfType.SOFTWARE, + ev_config=PerfSWConfig.CPU_CLOCK, + fn_name="get_spa_dirty", + sample_freq=10) + +print_count = 30 + +# initialize dgrn program object to read zfs_dirty_data_max +prog = drgn.program_from_kernel() + +# loop with callback to print_event +b["sync_events"].open_perf_buffer(print_event) +while 1: + try: + b.perf_buffer_poll() + except KeyboardInterrupt: + exit() diff --git a/cmd/estat.py b/cmd/estat.py index cd05f61..212f781 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -65,21 +65,23 @@ def die(*args, **kwargs): help_msg += " {} [prog] [options]\n".format(sys.argv[0]) help_msg += """ Tool for running eBPF programs for monitoring performance of various I/O - related subsystems, focusing on I/O latency. Output can be displayed has + related subsystems, focusing on I/O latency. Output can be displayed with histograms of I/O broken down by size and type, depending on the provided arguments. OPTIONS: -h show this help message and exit + + The subcommands listed in the following section share common options. + Each executes the specified tracing program for seconds, + displaying various histograms detailing the I/O latency as seen by the + particular subsystem. """ for program in programs: help_msg += "\n estat {:<13} [options] ".format(program) help_msg += """ - Executes the specified tracing program for seconds, displaying - various histograms detailing the I/O latency as seen by the particular - subsystem. 
-m monitor mode; emit data periodically
-M monitor mode; emit accumulated data periodically
@@ -93,12 +95,20 @@ def die(*args, **kwargs):
 -d LEVEL set BCC debug level
 -e emit the resulting eBPF script without executing it
 
-    estat zil [options]
+    The subcommands listed in the following section are standalone tracers.
+    Each has its own options as detailed below.
+
+    estat txg [options]
+        Collect spa_sync statistics for each txg.
+        -h show txg help message and exit
+        -c INTERVAL set the collection interval in seconds
+        -p POOL set the pool to monitor (default: domain0)
+
+    estat zil [POOL]
         Provides a breakdown of time spent doing ZIL-related activities, in
         particular the time spent allocating a block and time spent waiting for
-        the write I/O to complete.
-
-        -a pool the pool to trace
+        the write I/O to complete. If POOL is not specified, defaults to tracing
+        the pool 'domain0'.
 """

From a4646a41b99ea486c16452ea14b6ff8fe3a292ae Mon Sep 17 00:00:00 2001
From: Prakash Surya
Date: Tue, 24 Mar 2020 09:57:33 -0700
Subject: [PATCH 04/43] DLPX-69040 [Backport of DLPX-68396 to 6.0.2.0] port
 metaslab_alloc dtrace script to ebpf (#38)

---
 bpf/estat/metaslab-alloc.c | 141 +++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 bpf/estat/metaslab-alloc.c

diff --git a/bpf/estat/metaslab-alloc.c b/bpf/estat/metaslab-alloc.c
new file mode 100644
index 0000000..a413d39
--- /dev/null
+++ b/bpf/estat/metaslab-alloc.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright 2020 Delphix. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include
+#include
+#include
+#include
+
+#define VD_NAME_SIZE 32
+typedef struct {
+        u64 ts;
+        u64 size;
+        u64 asize;
+        u64 alloc_time;
+        char vd_name[VD_NAME_SIZE];
+} data_t;
+
+BPF_HASH(data_map, u32, data_t);
+
+#ifndef OPTARG
+#define POOL "domain0"
+#else
+#define POOL (OPTARG)
+#endif
+
+static inline bool
+equal_to_pool(char *str)
+{
+        char comparand[sizeof (POOL)];
+        bpf_probe_read(&comparand, sizeof (comparand), str);
+        char compare[] = POOL;
+
+        for (int i = 0; i < sizeof (comparand); ++i)
+                if (compare[i] != comparand[i])
+                        return (false);
+
+        return (true);
+}
+
+// @@ kprobe|metaslab_alloc_dva|metaslab_alloc_dva_entry
+int
+metaslab_alloc_dva_entry(struct pt_regs *ctx,
+    spa_t *spa, metaslab_class_t *mc, uint64_t psize)
+{
+        u32 tid = bpf_get_current_pid_tgid();
+        data_t data = {};
+
+        if (!equal_to_pool(spa->spa_name))
+                return (0);
+
+        data.ts = bpf_ktime_get_ns();
+        data.size = psize;
+
+        data_map.update(&tid, &data);
+
+        return (0);
+}
+
+// @@ kprobe|metaslab_group_alloc|metaslab_group_alloc_entry
+int
+metaslab_group_alloc_entry(struct pt_regs *ctx,
+    metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize)
+{
+        u32 tid = bpf_get_current_pid_tgid();
+        data_t *data = data_map.lookup(&tid);
+
+        if (data == NULL || data->ts == 0)
+                return (0);
+
+        data->asize = asize;
+        data->alloc_time = bpf_ktime_get_ns();
+
+        if (mg->mg_vd->vdev_path != NULL) {
+                bpf_probe_read_str(data->vd_name,
+                    sizeof(data->vd_name), mg->mg_vd->vdev_path);
+        } else {
+                bpf_probe_read_str(data->vd_name,
+                    sizeof(data->vd_name), mg->mg_vd->vdev_ops->vdev_op_type);
+        }
+
+        return (0);
+}
+
+// @@ kretprobe|metaslab_group_alloc|metaslab_group_alloc_exit
+int
+metaslab_group_alloc_exit(struct pt_regs *ctx)
+{
+        u32 tid = bpf_get_current_pid_tgid();
+        data_t *data = data_map.lookup(&tid);
+        char failure[] = "failure";
+        char success[] = "success";
+        char *axis;
+
+        if (data == NULL || data->ts == 0)
+                return (0);
+
+        if (PT_REGS_RC(ctx) == 
-1ULL) { + axis = failure; + } else { + axis = success; + } + + AGGREGATE_DATA(data->vd_name, axis, + bpf_ktime_get_ns() - data->ts, data->asize); + + data->asize = 0; + data->alloc_time = 0; + data->vd_name[0] = '\0'; + + return (0); +} + +// @@ kretprobe|metaslab_alloc_dva|metaslab_alloc_dva_exit +int +metaslab_alloc_dva_exit(struct pt_regs *ctx, + spa_t *spa, metaslab_class_t *mc, uint64_t psize) +{ + u32 tid = bpf_get_current_pid_tgid(); + data_t *data = data_map.lookup(&tid); + + if (data == NULL || data->ts == 0) + return (0); + + if (PT_REGS_RC(ctx) == 0) + return (0); + + char name[] = "allocation failures"; + char axis = 0; + AGGREGATE_DATA(name, &axis, + bpf_ktime_get_ns() - data->ts, data->size); + + data->ts = 0; + data->size = 0; + + data_map.delete(&tid); + + return (0); +} From f19c981bde6eacaecf43c7be8a64bdab59e198e2 Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Thu, 9 Apr 2020 13:18:39 -0700 Subject: [PATCH 05/43] DLPX-69162 [Backport of DLPX-69120 6.0.2.0] running estat against zvol fails to find zvol_impl.h on AWS (#42) --- bpf/estat/zvol.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/bpf/estat/zvol.c b/bpf/estat/zvol.c index 457d10b..d799e3d 100644 --- a/bpf/estat/zvol.c +++ b/bpf/estat/zvol.c @@ -11,8 +11,8 @@ #include #include #include -#include - +#include +#include #define ZVOL_WCE 0x8 #define ZVOL_READ 1 @@ -28,6 +28,31 @@ #define POOL (OPTARG) #endif +/* + * Copied and duplicated from "module/zfs/zvol.c" of the ZFS repository. + */ +typedef struct zvol_state { + char zv_name[MAXNAMELEN]; /* name */ + uint64_t zv_volsize; /* advertised space */ + uint64_t zv_volblocksize; /* volume block size */ + objset_t *zv_objset; /* objset handle */ + uint32_t zv_flags; /* ZVOL_* flags */ + uint32_t zv_open_count; /* open counts */ + uint32_t zv_changed; /* disk changed */ + zilog_t *zv_zilog; /* ZIL handle */ + rangelock_t zv_rangelock; /* for range locking */ + dnode_t *zv_dn; /* dnode hold */ + dev_t zv_dev; /* device id */ + struct gendisk *zv_disk; /* generic disk */ + struct request_queue *zv_queue; /* request queue */ + dataset_kstats_t zv_kstat; /* zvol kstats */ + list_node_t zv_next; /* next zvol_state_t linkage */ + uint64_t zv_hash; /* name hash */ + struct hlist_node zv_hlink; /* hash link */ + kmutex_t zv_state_lock; /* protects zvol_state_t */ + atomic_t zv_suspend_ref; /* refcount for suspend */ + krwlock_t zv_suspend_lock; /* suspend lock */ +} zvol_state_t; // Structure to hold thread local data typedef struct { From e7b1156d08306f0157ea8ee2482b63a4f8adc2d7 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Mon, 11 May 2020 14:54:06 -0600 Subject: [PATCH 06/43] stbtrace should have nonzero exit code on failure [Backport of #17 to 6.0.3.0] (#40) --- cmd/stbtrace.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmd/stbtrace.py b/cmd/stbtrace.py index b230351..e41cdc1 100755 --- a/cmd/stbtrace.py +++ b/cmd/stbtrace.py @@ -133,8 +133,12 @@ script = template.render() if args.bcc: print (script) + exit(0) else: try: exec(script) - except BaseException as e: + except KeyboardInterrupt: + exit(0) + except Exception as e: print (e) + exit(1) From fff5d5701f8a5f63b9a3b8bfdb7a397cd5cbb62b Mon Sep 17 00:00:00 2001 From: Don Brady Date: Mon, 11 May 2020 16:31:08 -0600 Subject: [PATCH 07/43] commas in the output of keys in estat [Backport of #22 to 6.0.3.0] (#41) estat: remove mandatory final positional duration argument estat: too many arguments output when there are too few --- 
bpf/standalone/zil.py | 33 ++++++++++++++++++++++++++++++--- lib/bcchelper.py | 3 +++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/bpf/standalone/zil.py b/bpf/standalone/zil.py index 0589e80..14003d9 100755 --- a/bpf/standalone/zil.py +++ b/bpf/standalone/zil.py @@ -22,6 +22,7 @@ from bcc import BPF from time import sleep, strftime +import argparse import sys import os repo_lib_dir = os.path.dirname(__file__) + "/../../lib/" @@ -46,8 +47,19 @@ #include """ -if len(sys.argv) > 1: - bpf_text += '#define POOL "' + sys.argv[1] + '"' +parser = argparse.ArgumentParser( + description='Collect zil latency statistics.', + usage='estat zil [options]') +parser.add_argument('-c', '--coll', type=int, action='store', + dest='collection_sec', + help='The collection interval in seconds') +parser.add_argument('-p', '--pool', type=str, action='store', + dest='pool', + help='The pool to monitor (default: domain0)') +args = parser.parse_args() + +if (args.pool): + bpf_text += '#define POOL "' + str(args.pool) + '"' else: bpf_text += '#define POOL "domain0"' @@ -276,7 +288,22 @@ BCCHelper.AVERAGE_AGGREGATION, "avg") alloc_helper.add_key_type("name") -print(" Tracing enabled... Hit Ctrl-C to end.") +if (not args.collection_sec): + print(" Tracing enabled... Hit Ctrl-C to end.") + +# Collect data for a collection interval if specified +if (args.collection_sec): + sleep(args.collection_sec) + try: + print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) + latency_helper.printall() + alloc_helper.printall() + exit(0) + except Exception as e: + print(str(e)) + exit(0) + +# Collect data until keyborad interrupt with output for each second while (1): try: sleep(60) diff --git a/lib/bcchelper.py b/lib/bcchelper.py index 6690c61..1b54300 100755 --- a/lib/bcchelper.py +++ b/lib/bcchelper.py @@ -571,6 +571,9 @@ def items_to_string(self, agg_items): stripped_key = "" try: stripped_key = keystr[keystr.index(',')+1:len(keystr)-1] + stripped_key = stripped_key.strip() + if stripped_key[-1:] == ",": + stripped_key = stripped_key[:-1] except ValueError: pass From 9cf16a2134f5276a1bb6a2abdd9c1efe011a86d6 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 14 May 2020 13:45:29 -0600 Subject: [PATCH 08/43] Compilation failures due to fentry issue [Backport of #33 to 6.0.3.0] (#47) Co-authored-by: Brad Lewis --- bpf/standalone/zil.py | 3 ++- bpf/stbtrace/zio.st | 3 ++- bpf/stbtrace/zpl.st | 3 ++- cmd/estat.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/bpf/standalone/zil.py b/bpf/standalone/zil.py index 14003d9..46d9ec8 100755 --- a/bpf/standalone/zil.py +++ b/bpf/standalone/zil.py @@ -260,7 +260,8 @@ "-I/usr/src/zfs-" + KVER + "/include/", "-I/usr/src/zfs-" + KVER + "/include/spl", "-I/usr/src/zfs-" + KVER + "/include/", - "-I/usr/src/zfs-" + KVER + "/include/linux"]) + "-I/usr/src/zfs-" + KVER + "/include/linux", + "-DCC_USING_FENTRY"]) b.attach_kprobe(event="zfs_write", fn_name="zfs_write_entry") b.attach_kretprobe(event="zfs_write", fn_name="zfs_write_return") diff --git a/bpf/stbtrace/zio.st b/bpf/stbtrace/zio.st index 7248439..57babe0 100644 --- a/bpf/stbtrace/zio.st +++ b/bpf/stbtrace/zio.st @@ -132,7 +132,8 @@ b = BPF(text=bpf_text, cflags=["-include", "/usr/src/zfs-" + KVER + "/zfs_config.h", "-I/usr/src/zfs-" + KVER + "/include/", "-I/usr/src/zfs-" + KVER + "/include/spl/", - "-I/usr/src/zfs-" + KVER + "/include/linux"]) + "-I/usr/src/zfs-" + KVER + "/include/linux", + "-DCC_USING_FENTRY"]) b.attach_kretprobe(event="vdev_queue_io_to_issue", fn_name="vdev_queue_issue_return") 
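These vdev_queue attach calls assume the target symbols exist; for probes that may be absent on newer kernels, patch 02's guard pattern applies. A minimal sketch of that pattern, assuming bcc's Python bindings (the "noop" program is a placeholder, not part of this repo):

    from bcc import BPF

    b = BPF(text='int noop(void *ctx) { return 0; }')
    # get_kprobe_functions() matches against the kernel's list of
    # traceable functions, so the attach only happens when the
    # target symbol is actually present in the running kernel.
    if BPF.get_kprobe_functions(b'blk_start_request'):
        b.attach_kprobe(event='blk_start_request', fn_name='noop')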
b.attach_kprobe(event="vdev_queue_io_done", fn_name="vdev_queue_done") diff --git a/bpf/stbtrace/zpl.st b/bpf/stbtrace/zpl.st index b1b8828..22ec052 100644 --- a/bpf/stbtrace/zpl.st +++ b/bpf/stbtrace/zpl.st @@ -173,7 +173,8 @@ int zfs_write_done(struct pt_regs *ctx) """ KVER = os.popen('uname -r').read().rstrip() b = BPF(text=bpf_text, - cflags=["-I/usr/src/zfs-" + KVER + "/include/spl"]) + cflags=["-I/usr/src/zfs-" + KVER + "/include/spl/", + "-DCC_USING_FENTRY"]) b.attach_kprobe(event="zfs_read", fn_name="zfs_read_start") b.attach_kprobe(event="zfs_write", fn_name="zfs_write_start") diff --git a/cmd/estat.py b/cmd/estat.py index 212f781..642b140 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -394,7 +394,8 @@ class Args: cflags = ["-include", "/usr/src/zfs-" + KVER + "/zfs_config.h", "-I/usr/src/zfs-" + KVER + "/include/", - "-I/usr/src/zfs-" + KVER + "/include/spl"] + "-I/usr/src/zfs-" + KVER + "/include/spl", + "-DCC_USING_FENTRY"] if script_arg: cflags.append("-DOPTARG=\"" + script_arg + "\"") From 607e52ffdd072dad60d281c79ad2b5242cf169c9 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Tue, 19 May 2020 16:24:30 -0600 Subject: [PATCH 09/43] Switch to Python 3 [Backport of #9 to 6.0.3.0] (#43) Co-authored-by: John Gallagher --- .github/scripts/flake8_stbtrace.sh | 33 ++++++++++++++++++++++++++++++ .github/workflows/flake8.yml | 4 ++-- bpf/standalone/zil.py | 4 ++-- bpf/stbtrace/io.st | 8 +++----- bpf/stbtrace/iscsi.st | 11 +++++----- bpf/stbtrace/nfs.st | 12 +++++------ bpf/stbtrace/vfs.st | 10 ++++----- bpf/stbtrace/zio.st | 11 +++++----- bpf/stbtrace/zpl.st | 10 ++++----- cmd/estat.py | 26 +++++++++++------------ cmd/stbtrace.py | 28 ++++++++++++------------- debian/control | 2 +- lib/bcchelper.py | 31 +++++++++++++++------------- 13 files changed, 108 insertions(+), 82 deletions(-) create mode 100755 .github/scripts/flake8_stbtrace.sh diff --git a/.github/scripts/flake8_stbtrace.sh b/.github/scripts/flake8_stbtrace.sh new file mode 100755 index 0000000..53181dd --- /dev/null +++ b/.github/scripts/flake8_stbtrace.sh @@ -0,0 +1,33 @@ +#/bin/bash +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# +# SPDX-License-Identifier: GPL-2.0-or-later +# + +# +# This provides flake8 linting for the various stbtrace templates. It uses +# stbtrace with the '--bcc' flag to produce and print Python code, and then +# runs flake8 against that. +# + +set -o pipefail + +function die() { + echo "$(basename "$0"): $*" >&2 + exit 1 +} + +prgms=$(find ./bpf/stbtrace -name '*.st' | xargs basename --suffix='.st') || + die "Failed to generate list of stbtrace programs" + +for prgm in $prgms; do + echo "Checking stbtrace program $prgm..." + ./cmd/stbtrace.py $prgm --bcc >/tmp/$prgm.py || + die "Failed to generate python source" + flake8 /tmp/$prgm.py --show-source || die "flake8 errors found in" \ + "'bpf/stbtrace/$prgm.st'. Line numbers in error messages" \ + "correspond to the output of './cmd/stbtrace.py $prgm --bcc'" +done + +exit 0 diff --git a/.github/workflows/flake8.yml b/.github/workflows/flake8.yml index 4a75f47..9c2572d 100644 --- a/.github/workflows/flake8.yml +++ b/.github/workflows/flake8.yml @@ -5,15 +5,15 @@ on: [pull_request, push] jobs: build: runs-on: ubuntu-latest - steps: - uses: actions/checkout@v1 - name: Set up Python uses: actions/setup-python@v1 with: - python-version: 2.7 + python-version: 3.6 - name: flake8 run: | python -m pip install --upgrade pip pip install flake8 flake8 . 
--show-source --statistics + ./.github/scripts/flake8_stbtrace.sh diff --git a/bpf/standalone/zil.py b/bpf/standalone/zil.py index 46d9ec8..0052972 100755 --- a/bpf/standalone/zil.py +++ b/bpf/standalone/zil.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # # Copyright 2019 Delphix. All rights reserved. # @@ -305,7 +305,7 @@ exit(0) # Collect data until keyborad interrupt with output for each second -while (1): +while True: try: sleep(60) except KeyboardInterrupt: diff --git a/bpf/stbtrace/io.st b/bpf/stbtrace/io.st index 8ad836d..0dde504 100755 --- a/bpf/stbtrace/io.st +++ b/bpf/stbtrace/io.st @@ -6,11 +6,9 @@ from bcc import BPF from time import sleep -from time import time -import datetime -import ctypes as ct import os import sys + # # Find BCCHelper. If we are being run from the repo, we should be able to find # it in the repo's lib/ directory. If we can't find that, look for BCCHelper @@ -20,7 +18,7 @@ base_dir = os.path.dirname(__file__) + "/../" if not os.path.exists(base_dir + "lib/bcchelper.py"): base_dir = "/usr/share/performance-diagnostics/" sys.path.append(base_dir + 'lib/') -from bcchelper import BCCHelper +from bcchelper import BCCHelper # noqa: E402 # BPF disk io program @@ -125,7 +123,7 @@ int disk_io_done(struct pt_regs *ctx, struct request *reqp) io_base_data.delete((u64 *) &reqp); return 0; } -""" +""" # noqa: W293 b = BPF(text=bpf_text) if BPF.get_kprobe_functions(b'blk_start_request'): diff --git a/bpf/stbtrace/iscsi.st b/bpf/stbtrace/iscsi.st index b84caf0..b5f08ad 100755 --- a/bpf/stbtrace/iscsi.st +++ b/bpf/stbtrace/iscsi.st @@ -6,11 +6,9 @@ from bcc import BPF from time import sleep -from time import time -import datetime -import ctypes as ct import os import sys + # # Find BCCHelper. If we are being run from the repo, we should be able to find # it in the repo's lib/ directory. If we can't find that, look for BCCHelper @@ -20,7 +18,7 @@ base_dir = os.path.dirname(__file__) + "/../" if not os.path.exists(base_dir + "lib/bcchelper.py"): base_dir = "/usr/share/performance-diagnostics/" sys.path.append(base_dir + 'lib/') -from bcchelper import BCCHelper +from bcchelper import BCCHelper # noqa: E402 # BPF txg program bpf_text = """ @@ -63,7 +61,8 @@ BPF_HASH($hist.name$, iscsi_hist_key_t, u64); }$ // Probe functions to initialize thread local data -int iscsi_target_start(struct pt_regs *ctx, struct iscsi_conn *conn, struct iscsi_cmd *cmd, struct iscsi_scsi_req *hdr) +int iscsi_target_start(struct pt_regs *ctx, struct iscsi_conn *conn, + struct iscsi_cmd *cmd, struct iscsi_scsi_req *hdr) { iscsi_data_t data = {}; data.ts = bpf_ktime_get_ns(); @@ -122,7 +121,7 @@ int iscsi_target_end(struct pt_regs *ctx, struct iscsi_cmd *cmd) return 0; } -""" +""" # noqa: W293 b = BPF(text=bpf_text) b.attach_kprobe(event="iscsit_process_scsi_cmd", fn_name="iscsi_target_start") diff --git a/bpf/stbtrace/nfs.st b/bpf/stbtrace/nfs.st index cabb893..9671007 100755 --- a/bpf/stbtrace/nfs.st +++ b/bpf/stbtrace/nfs.st @@ -6,11 +6,9 @@ from bcc import BPF from time import sleep -from time import time -import datetime -import ctypes as ct import os import sys + # # Find BCCHelper. If we are being run from the repo, we should be able to find # it in the repo's lib/ directory. 
If we can't find that, look for BCCHelper @@ -20,7 +18,7 @@ base_dir = os.path.dirname(__file__) + "/../" if not os.path.exists(base_dir + "lib/bcchelper.py"): base_dir = "/usr/share/performance-diagnostics/" sys.path.append(base_dir + 'lib/') -from bcchelper import BCCHelper +from bcchelper import BCCHelper # noqa: E402 # BPF txg program bpf_text = """ @@ -167,8 +165,8 @@ int nfsd4_read_start(struct pt_regs *ctx, struct svc_rqst *rqstp, void *cstate, return 0; } -int nfsd4_write_start(struct pt_regs *ctx, struct svc_rqst *rqstp, void *cstate, - nfsd4_write *nfs_write) +int nfsd4_write_start(struct pt_regs *ctx, struct svc_rqst *rqstp, + void *cstate, nfsd4_write *nfs_write) { u32 pid = bpf_get_current_pid_tgid(); nfs_data_t data = {}; @@ -302,7 +300,7 @@ int nfsd4_write_done(struct pt_regs *ctx) return 0; } -""" +""" # noqa: W293 b = BPF(text=bpf_text) b.attach_kprobe(event="nfsd_read", fn_name="nfsd3_read_start") diff --git a/bpf/stbtrace/vfs.st b/bpf/stbtrace/vfs.st index af1c941..038eae2 100644 --- a/bpf/stbtrace/vfs.st +++ b/bpf/stbtrace/vfs.st @@ -6,11 +6,9 @@ from bcc import BPF from time import sleep -from time import time -import datetime -import ctypes as ct import os import sys + # # Find BCCHelper. If we are being run from the repo, we should be able to find # it in the repo's lib/ directory. If we can't find that, look for BCCHelper @@ -20,7 +18,7 @@ base_dir = os.path.dirname(__file__) + "/../" if not os.path.exists(base_dir + "lib/bcchelper.py"): base_dir = "/usr/share/performance-diagnostics/" sys.path.append(base_dir + 'lib/') -from bcchelper import BCCHelper +from bcchelper import BCCHelper # noqa: E402 # BPF txg program bpf_text = """ @@ -68,7 +66,7 @@ BPF_HASH($hist.name$, vfs_hist_key_t, u64); // Probe functions to initialize thread local data int vfs_start(struct pt_regs *ctx, void *file, void *user_buf, - u64 count, void *ppos) + u64 count, void *ppos) { u32 pid = bpf_get_current_pid_tgid(); vfs_data_t data = {}; @@ -187,7 +185,7 @@ int vfs_write_done(struct pt_regs *ctx) u64 ts = bpf_ktime_get_ns(); return vfs_aggregate_data(ts, WRITE_STR); } -""" +""" # noqa: W293 b = BPF(text=bpf_text) b.attach_kprobe(event="vfs_read", fn_name="vfs_start") diff --git a/bpf/stbtrace/zio.st b/bpf/stbtrace/zio.st index 57babe0..a01ab4e 100644 --- a/bpf/stbtrace/zio.st +++ b/bpf/stbtrace/zio.st @@ -6,11 +6,9 @@ from bcc import BPF from time import sleep -from time import time -import datetime -import ctypes as ct import os import sys + # # Find BCCHelper. If we are being run from the repo, we should be able to find # it in the repo's lib/ directory. 
If we can't find that, look for BCCHelper @@ -20,7 +18,7 @@ base_dir = os.path.dirname(__file__) + "/../" if not os.path.exists(base_dir + "lib/bcchelper.py"): base_dir = "/usr/share/performance-diagnostics/" sys.path.append(base_dir + 'lib/') -from bcchelper import BCCHelper +from bcchelper import BCCHelper # noqa: E402 # BPF txg program bpf_text = """ @@ -125,7 +123,7 @@ int vdev_queue_done(struct pt_regs *ctx, zio_t *zio) } -""" +""" # noqa: W293 KVER = os.popen('uname -r').read().rstrip() b = BPF(text=bpf_text, cflags=["-include", @@ -135,7 +133,8 @@ b = BPF(text=bpf_text, cflags=["-include", "-I/usr/src/zfs-" + KVER + "/include/linux", "-DCC_USING_FENTRY"]) -b.attach_kretprobe(event="vdev_queue_io_to_issue", fn_name="vdev_queue_issue_return") +b.attach_kretprobe(event="vdev_queue_io_to_issue", + fn_name="vdev_queue_issue_return") b.attach_kprobe(event="vdev_queue_io_done", fn_name="vdev_queue_done") helper = BCCHelper(b, BCCHelper.ANALYTICS_PRINT_MODE) diff --git a/bpf/stbtrace/zpl.st b/bpf/stbtrace/zpl.st index 22ec052..80191e5 100644 --- a/bpf/stbtrace/zpl.st +++ b/bpf/stbtrace/zpl.st @@ -6,11 +6,9 @@ from bcc import BPF from time import sleep -from time import time -import datetime -import ctypes as ct import os import sys + # # Find BCCHelper. If we are being run from the repo, we should be able to find # it in the repo's lib/ directory. If we can't find that, look for BCCHelper @@ -20,7 +18,8 @@ base_dir = os.path.dirname(__file__) + "/../" if not os.path.exists(base_dir + "lib/bcchelper.py"): base_dir = "/usr/share/performance-diagnostics/" sys.path.append(base_dir + 'lib/') -from bcchelper import BCCHelper +from bcchelper import BCCHelper # noqa: E402 + # BPF txg program bpf_text = """ @@ -170,7 +169,8 @@ int zfs_write_done(struct pt_regs *ctx) return zpl_aggregate_data(ts, WRITE_STR); } -""" +""" # noqa: W293 + KVER = os.popen('uname -r').read().rstrip() b = BPF(text=bpf_text, cflags=["-I/usr/src/zfs-" + KVER + "/include/spl/", diff --git a/cmd/estat.py b/cmd/estat.py index 642b140..c18ec3e 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # # Copyright 2019 Delphix. All rights reserved. # @@ -19,8 +19,6 @@ # the performance data collector. # -from __future__ import print_function - from bcc import BPF import getopt from glob import glob @@ -423,7 +421,7 @@ class Args: line + "'") probe_type = probe_spec[0] if probe_type == "kprobe": - if BPF.get_kprobe_functions(probe_spec[1]): + if BPF.get_kprobe_functions(probe_spec[1].encode('utf-8')): b.attach_kprobe(event=probe_spec[1], fn_name=probe_spec[2]) probes.add("p_" + probe_spec[1] + "_bcc_" + str(os.getpid())) else: @@ -481,20 +479,20 @@ class Args: # output if monitor: # TODO can we do this without shelling out to 'date'? 
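One answer to the TODO above, as a minimal sketch (assuming Python 3.7+, where time.time_ns() exists): the shell-out to 'date' can be replaced by a direct call that returns wall-clock nanoseconds with no subprocess:

    import time

    ds__start = time.time_ns()   # same epoch-nanoseconds as 'date +%s%N'
    # ... sleep for the collection interval ...
    ds__delta = time.time_ns() - ds__start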
- ds__start = long(os.popen("date +%s%N").readlines()[0]) + ds__start = int(os.popen("date +%s%N").readlines()[0]) while (1): try: sleep(duration) except KeyboardInterrupt: break try: - ds__end = long(os.popen("date +%s%N").readlines()[0]) + ds__end = int(os.popen("date +%s%N").readlines()[0]) ds__delta = ds__end - ds__start if not accum: ds__start = ds__end if args.summary and args.normalize: - helper1.normalize("ops", ds__delta / 1000000000) - helper3.normalize("opst", ds__delta / 1000000000) + helper1.normalize("ops", ds__delta // 1000000000) + helper3.normalize("opst", ds__delta // 1000000000) clear_data = not accum if args.latsize_hist: helper2.printall(clear_data) @@ -503,26 +501,26 @@ class Args: if args.summary and args.total: helper3.printall(clear_data) print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) - except e: + except Exception as e: die(e) else: - ds__start = long(os.popen("date +%s%N").readlines()[0]) + ds__start = int(os.popen("date +%s%N").readlines()[0]) try: sleep(duration) except KeyboardInterrupt: pass try: - ds__delta = long(os.popen("date +%s%N").readlines()[0]) - ds__start + ds__delta = int(os.popen("date +%s%N").readlines()[0]) - ds__start if args.summary and args.normalize: - helper1.normalize("ops", ds__delta / 1000000000) - helper3.normalize("opst", ds__delta / 1000000000) + helper1.normalize("ops", ds__delta // 1000000000) + helper3.normalize("opst", ds__delta // 1000000000) if args.latsize_hist: helper2.printall() if args.lat_hist or args.size_hist or args.summary: helper1.printall() if args.summary and args.total: helper3.printall() - except e: + except Exception as e: die(e) # diff --git a/cmd/stbtrace.py b/cmd/stbtrace.py index e41cdc1..19373f4 100755 --- a/cmd/stbtrace.py +++ b/cmd/stbtrace.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # # Copyright 2019 Delphix. All rights reserved. 
# @@ -45,7 +45,7 @@ help='A comma separated list specifying collection ' 'axis or "None"') parser.add_argument('-s', '--stats', action='store', dest='STATS', - help='A comma separated list of statistics to' + help='A comma separated list of statistics to ' 'collect') parser.add_argument('-c', '--coll', type=int, action='store', dest='collection_sec', @@ -67,22 +67,22 @@ with open(filename, 'r') as json_file: data = json.load(json_file) except IOError as e: - print ("Error reading " + filename + ": " + e.strerror) + print("Error reading " + filename + ": " + e.strerror) exit() -except ValueError as e: - print ("Error reading " + filename + ": invalid format") +except ValueError: + print("Error reading " + filename + ": invalid format") exit() if args.fields: - print (args.tracer + " fields") - print ("axes:") + print(args.tracer + " fields") + print("axes:") for key in data['keys'].keys(): - print (" " + key) - print ("statistics:") + print(" " + key) + print("statistics:") for map in data['maps'].keys(): - print (" " + map) + print(" " + map) for hist in data['hists'].keys(): - print (" " + hist) + print(" " + hist) exit() # @@ -97,7 +97,7 @@ with open(filename, 'r') as f: file_content = f.read() except IOError as e: - print ("Error reading " + filename + ": " + e.strerror) + print("Error reading " + filename + ": " + e.strerror) exit() template = IterativeTemplate(file_content) @@ -132,7 +132,7 @@ # Perform template substitutions and execute or output the resulting script script = template.render() if args.bcc: - print (script) + print(script, end='') exit(0) else: try: @@ -140,5 +140,5 @@ except KeyboardInterrupt: exit(0) except Exception as e: - print (e) + print(e) exit(1) diff --git a/debian/control b/debian/control index 0f5806d..b03fda7 100644 --- a/debian/control +++ b/debian/control @@ -13,6 +13,6 @@ Standards-Version: 4.1.2 Package: performance-diagnostics Architecture: any -Depends: python-bcc, python3-bcc, python-minimal +Depends: python3-bcc, python-minimal Description: eBPF-based Performance Diagnostic Tools A collection of eBPF-based tools for diagnosing performance issues. 
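A note on the "/" to "//" conversions in the estat.py and bcchelper.py hunks of this patch: Python 2's "/" floor-divides when both operands are ints, while Python 3's "/" always yields a float, so "//" is needed to keep these counters and averages integral. A two-line illustration:

    assert 7 // 2 == 3     # floor division: still an int
    assert 7 / 2 == 3.5    # Python 3 true division: now a float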
diff --git a/lib/bcchelper.py b/lib/bcchelper.py index 1b54300..211b848 100755 --- a/lib/bcchelper.py +++ b/lib/bcchelper.py @@ -229,7 +229,7 @@ def apply_normalization(self, name, value): """Apply normalization if specified.""" for n in self.normalize_aggregations: if n[0] == name: - value = value / n[1] + value = value // n[1] return value def add_key_type(self, name, type=None): @@ -265,7 +265,7 @@ def log_lin_hist_value(self, slot): """ mag_values = [10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000] - idx = slot / 10 + idx = slot // 10 if idx >= len(mag_values): self.logger.debug("log_lin_hist_value slot: %d out of range", slot) return -1 @@ -296,7 +296,7 @@ def log_hist_value_str(slot): def get_ampersand_string(value, total): retstr = "|" for i in range(((value-1) * - BCCHelper.HISTOGRAM_AMPERSAND_WIDTH / total) + 1): + BCCHelper.HISTOGRAM_AMPERSAND_WIDTH // total) + 1): retstr += "@" return retstr @@ -324,11 +324,11 @@ def histogram_entry(self, hist_type, slot, value, total): # using log_lin_hist_slot(), so convert to microseconds # if (slot > 0): - rangestr += str(self.log_lin_hist_value(slot - 1)/1000) + rangestr += str(self.log_lin_hist_value(slot - 1)//1000) else: rangestr += "0" rangestr += ", " - rangestr += str(self.log_lin_hist_value(slot)/1000) + ")" + rangestr += str(self.log_lin_hist_value(slot)//1000) + ")" return "\n" + self.ESTAT_HIST_FORMAT.format( rangestr, str(value), BCCHelper.get_ampersand_string(value, total)) @@ -348,7 +348,7 @@ def output_histogram(self, hist_items, hist_type): h = "" - total = 0L + total = 0 if self.mode != self.ANALYTICS_PRINT_MODE: h += '\n' h += self.ESTAT_HIST_FORMAT.format( @@ -359,13 +359,13 @@ def output_histogram(self, hist_items, hist_type): hist_items.sort(key=lambda x: x[0].slot) slot = -1 - value = 0L + value = 0 for k, v in hist_items: if slot != k.slot and slot != -1: h += self.histogram_entry(hist_type, slot, value, total) if self.mode == self.ANALYTICS_PRINT_MODE: h += "," - value = 0L + value = 0 slot = k.slot value += v.value h += self.histogram_entry(hist_type, slot, value, total) @@ -399,7 +399,11 @@ def key_type_string(self, key, key_type): DEFAULT_KEY_TYPE - no post-processing needed IP_KEY_TYPE - strips unwanted symbols from IPV4 literal address """ - keystr = str(getattr(key, key_type[0])) + attr = getattr(key, key_type[0]) + if type(attr) is bytes: + keystr = attr.decode('utf-8', errors='backslashreplace') + else: + keystr = str(attr) # Extract IPv4 literal address, often preceded by "*," or "," if key_type[1] == self.IP_KEY_TYPE: @@ -422,7 +426,7 @@ def combined_average(self, agg_items): count += item[1].count if count: - return sum / count + return sum // count else: return 0 @@ -525,12 +529,12 @@ def combine_items(self, agg, items, count_val, aversum_val): com_val = self.combined_scalar_value(items) if agg[1] == self.AVERSUM_AGGREGATION: if count_val: - com_val = com_val / count_val + com_val = com_val // count_val else: com_val = 0 elif agg[1] == self.STDDEV_AGGREGATION: if count_val and aversum_val: - com_val = com_val / count_val - ( + com_val = com_val // count_val - ( aversum_val * aversum_val) else: com_val = 0 @@ -645,5 +649,4 @@ def printall(self, clear=True): """ Print and clear all data from aggregations in the helper.""" agg_items = self.get_ordered_items(clear) outstr = self.items_to_string(agg_items) - # The trailing comma prevents the print from adding a newline. 
-    print outstr,
+    print(outstr, end='', flush=True)

From 8e746faa05597b40637e34b551ee91d8b65f9118 Mon Sep 17 00:00:00 2001
From: Don Brady
Date: Fri, 22 May 2020 14:31:29 -0600
Subject: [PATCH 10/43] [Backport of DLPX-68397 to 6.0.3.0] need nfs threads to
 replace dtrace script (#48)

---
 cmd/nfs_threads.py | 207 +++++++++++++++++++++++++++++++++++++++
 debian/control     |   2 +-
 debian/rules       |   1 +
 3 files changed, 209 insertions(+), 1 deletion(-)
 create mode 100755 cmd/nfs_threads.py

diff --git a/cmd/nfs_threads.py b/cmd/nfs_threads.py
new file mode 100755
index 0000000..674c6d8
--- /dev/null
+++ b/cmd/nfs_threads.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2020 by Delphix. All rights reserved.
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+
+'''
+Display NFS thread usage info along with NFS I/O context.
+
+Output Sample:
+
+   packets   sockets   threads   threads  metadata      read      read     write     write
+   arrived  enqueued     woken      used     calls      iops   thruput      iops   thruput
+      4589         0      4589        25        16       273     3.6MB       212     2.6MB
+      4735         0      4735         8         1       287     3.8MB       212     2.7MB
+      4693         0      4693        10         0       280     3.7MB       216     2.7MB
+      4625         0      4625        15         0       278     3.7MB       212     2.6MB
+      4687         0      4687         7         1       285     3.8MB       210     2.6MB
+      4701         0      4701        12         0       285     3.8MB       215     2.7MB
+'''
+
+import psutil
+from signal import signal, SIGINT
+import sys
+from time import sleep
+
+POOL_STATS = "/proc/fs/nfsd/pool_stats"
+NFSD_STATS = "/proc/net/rpc/nfsd"
+
+H1 = ['packets', 'sockets', 'threads', 'threads', 'metadata', 'read',
+      'read', 'write', 'write']
+H2 = ['arrived', 'enqueued', 'woken', 'used', 'calls', 'iops', 'thruput',
+      'iops', 'thruput']
+
+INTERVAL = 5
+
+
+def server_stopped(message=''):
+    print("NFS Server Stopped {}".format(message))
+    sys.exit()
+
+
+def print_header(header):
+    for col in header:
+        print('{0:>10}'.format(col), end='')
+    print()
+
+
+def pool_stats():
+    try:
+        with open(POOL_STATS, "r") as file:
+            for line in file:
+                if not line.startswith("#"):
+                    fields = line.split(" ")
+                    packets = int(fields[1])
+                    enqueued = int(fields[2])
+                    woken = int(fields[3])
+                    timedout = int(fields[4])
+        return packets, enqueued, woken, timedout
+    except OSError:
+        server_stopped()
+
+
+def nfs_stats():
+    try:
+        metadata = 0
+        readops = 0
+        writeops = 0
+        with open(NFSD_STATS, "r") as file:
+            for line in file:
+                if line.startswith("io"):
+                    fields = line.split(" ")
+                    readbytes = int(fields[1])
+                    writebytes = int(fields[2])
+                if line.startswith("proc3"):
+                    fields = line.split(" ")
+                    readops += int(fields[8])
+                    writeops += int(fields[9])
+                    metadata += int(fields[3])
+                    metadata += int(fields[4])
+                    metadata += int(fields[5])
+                    metadata += int(fields[6])
+                    metadata += int(fields[20])
+                    metadata += int(fields[21])
+                    metadata += int(fields[22])
+                if line.startswith("proc4ops"):
+                    fields = line.split(" ")
+                    readops += int(fields[27])
+                    writeops += int(fields[40])
+                    metadata += int(fields[5])
+                    metadata += int(fields[8])
+                    metadata += int(fields[11])
+                    metadata += int(fields[17])
+                    metadata += int(fields[36])
+        return readbytes, writebytes, readops, writeops, metadata
+    except OSError:
+        server_stopped()
+
+
+def context_switches(pids):
+    "Return a list of context switches per process in pids"
+    ls = []
+    for pid in pids:
+        try:
+            pctxsw = psutil.Process(pid).num_ctx_switches()
+            ls.append(pctxsw.voluntary + pctxsw.involuntary)
+        except psutil.NoSuchProcess:
+            server_stopped()
+    return ls
+
+
+def nfsd_processes():
+    "Return a list of nfsd processes"
+    ls = []
+    for p in psutil.process_iter(attrs=['name', 'pid', 'uids']):
+        if p.info['name'] == "nfsd" and p.info['uids'].real == 0:
+            ls.append(p.info['pid'])
+    return ls
+
+
+def print_value(value):
+    print('{0:>10}'.format(value), end='')
+
+
+def print_thruput(value):
+    if value > 1073741824:
+        print('{0:>8}GB'.format(round(value / 1073741824, 1)), end='')
+    elif value > 1048576:
+        print('{0:>8}MB'.format(round(value / 1048576, 1)), end='')
+    else:
+        print('{0:>8}KB'.format(int(value / 1024)), end='')
+
+
+def print_line():
+    pids = nfsd_processes()
+
+    prevSwitches = context_switches(pids)
+    prevPackets, prevEnqueued, prevWoken, prevTimedout = pool_stats()
+    prevRB, prevWB, prevRO, prevWO, prevMeta = nfs_stats()
+
+    while(not sleep(INTERVAL)):
+        nextSwitches = context_switches(pids)
+        nextPackets, nextEnqueued, nextWoken, nextTimedout = pool_stats()
+        nextRB, nextWB, nextRO, nextWO, nextMeta = nfs_stats()
+
+        threads = 0
+        for i in range(0, len(prevSwitches)):
+            if not prevSwitches[i] == nextSwitches[i]:
+                threads += 1
+        threads -= nextTimedout - prevTimedout
+        prevSwitches = nextSwitches.copy()
+
+        #
+        # The published 'sockets-enqueued' value needs adjustment
+        #
+        enqueued = (nextEnqueued - prevEnqueued) - (nextWoken - prevWoken)
+
+        #
+        # For IOPS values less than 10 display with decimal
+        #
+        readOps = (nextRO - prevRO) / INTERVAL
+        writeOps = (nextWO - prevWO) / INTERVAL
+        readOps = int(readOps) if readOps > 9 else round(readOps, 1)
+        writeOps = int(writeOps) if writeOps > 9 else round(writeOps, 1)
+
+        #
+        # The read/write values are published as a 32-bit
+        # value so account for it to wrap in the interval
+        #
+        if nextRB < prevRB:
+            prevRB = 0
+        if nextWB < prevWB:
+            prevWB = 0
+
+        print_value(nextPackets - prevPackets)
+        print_value(enqueued)
+        print_value(nextWoken - prevWoken)
+        print_value(threads)
+        print_value(nextMeta - prevMeta)
+        print_value(readOps)
+        print_thruput((nextRB - prevRB) / INTERVAL)
+        print_value(writeOps)
+        print_thruput((nextWB - prevWB) / INTERVAL)
+        print()
+
+        prevPackets = nextPackets
+        prevEnqueued = nextEnqueued
+        prevWoken = nextWoken
+        prevTimedout = nextTimedout
+        prevMeta = nextMeta
+        prevRB = nextRB
+        prevWB = nextWB
+        prevRO = nextRO
+        prevWO = nextWO
+
+
+def handler(signal_received, frame):
+    print()
+    sys.exit(0)
+
+
+signal(SIGINT, handler)
+
+print_header(H1)
+print_header(H2)
+print_line()
diff --git a/debian/control b/debian/control
index b03fda7..f50a22d 100644
--- a/debian/control
+++ b/debian/control
@@ -13,6 +13,6 @@ Standards-Version: 4.1.2
 
 Package: performance-diagnostics
 Architecture: any
-Depends: python3-bcc, python-minimal
+Depends: python3-bcc, python-minimal, python3-psutil
 Description: eBPF-based Performance Diagnostic Tools
  A collection of eBPF-based tools for diagnosing performance issues.
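The 'threads used' column computed in print_line() above is a heuristic: an nfsd thread is counted as used when its context-switch count moved during the interval, and threads that merely timed out waiting are then subtracted back out. A minimal sketch of the counting step (the sample counts are invented for illustration):

    prev = [100, 250, 40]    # per-thread context-switch counts at time t
    curr = [180, 250, 41]    # the same counts at t + INTERVAL
    # threads whose count changed did some work during the interval
    used = sum(1 for a, b in zip(prev, curr) if a != b)    # -> 2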
diff --git a/debian/rules b/debian/rules index cfec6fa..d1dbb9a 100755 --- a/debian/rules +++ b/debian/rules @@ -16,6 +16,7 @@ override_dh_auto_build: mkdir -p build/cmd/ cp cmd/estat.py build/cmd/estat cp cmd/stbtrace.py build/cmd/stbtrace + cp cmd/nfs_threads.py build/cmd/nfs_threads override_dh_auto_install: dh_install build/cmd/* /usr/bin From c27002d0b037acd76b62f3ad97038c1761bf276b Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Fri, 6 Dec 2019 11:28:41 -0700 Subject: [PATCH 11/43] Fix ZPL Collectors after ZFS changes to sync flags and include files (#21) --- bpf/estat/zpl.c | 2 +- bpf/stbtrace/zpl.st | 7 ++++++- cmd/estat.py | 2 ++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/bpf/estat/zpl.c b/bpf/estat/zpl.c index f3d88ac..09e3b8e 100644 --- a/bpf/estat/zpl.c +++ b/bpf/estat/zpl.c @@ -57,7 +57,7 @@ zfs_read_write_entry(io_info_t *info, struct inode *ip, uio_t *uio, int flags) info->start_time = bpf_ktime_get_ns(); info->bytes = uio->uio_resid; info->is_sync = - z_os->os_sync == ZFS_SYNC_ALWAYS || (flags & (FSYNC | FDSYNC)); + z_os->os_sync == ZFS_SYNC_ALWAYS || (flags & (O_SYNC | O_DSYNC)); u32 tid = bpf_get_current_pid_tgid(); io_info_t *infop = io_info_map.lookup(&tid); diff --git a/bpf/stbtrace/zpl.st b/bpf/stbtrace/zpl.st index 80191e5..41df731 100644 --- a/bpf/stbtrace/zpl.st +++ b/bpf/stbtrace/zpl.st @@ -173,7 +173,12 @@ int zfs_write_done(struct pt_regs *ctx) KVER = os.popen('uname -r').read().rstrip() b = BPF(text=bpf_text, - cflags=["-I/usr/src/zfs-" + KVER + "/include/spl/", + cflags=["-include", + "/usr/src/zfs-" + KVER + "/zfs_config.h", + "-include", + "/usr/src/zfs-" + KVER + "/include/spl/sys/types.h", + "-I/usr/src/zfs-" + KVER + "/include/", + "-I/usr/src/zfs-" + KVER + "/include/spl/", "-DCC_USING_FENTRY"]) b.attach_kprobe(event="zfs_read", fn_name="zfs_read_start") diff --git a/cmd/estat.py b/cmd/estat.py index c18ec3e..4a7f070 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -391,6 +391,8 @@ class Args: KVER = os.popen('uname -r').read().rstrip() cflags = ["-include", "/usr/src/zfs-" + KVER + "/zfs_config.h", + "-include", + "/usr/src/zfs-" + KVER + "/include/spl/sys/types.h", "-I/usr/src/zfs-" + KVER + "/include/", "-I/usr/src/zfs-" + KVER + "/include/spl", "-DCC_USING_FENTRY"] From c817b300eb31747d02f57f33e58b7295eefda77f Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 10 Jan 2020 12:50:52 -0800 Subject: [PATCH 12/43] Fixes for estat usage and arg parsing error messages (#26) * Fix usage message for zil program * Fix arg parsing error message --- cmd/estat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmd/estat.py b/cmd/estat.py index 4a7f070..ea89a89 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -202,7 +202,10 @@ class Args: else: assert False, "unhandled option: " + opt -if len(rem_args) != 1: +if len(rem_args) == 0: + die("Missing duration argument") + +if len(rem_args) > 1: die("Too many arguments") try: From e57bac9dc382f918dac4150b13299e68239b15ed Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 10 Jan 2020 15:52:54 -0800 Subject: [PATCH 13/43] Use copy of bcc_helper from performance-diagnositics --- bpf/stbtrace/io.st | 7 +++---- bpf/stbtrace/iscsi.st | 6 +++--- bpf/stbtrace/nfs.st | 6 +++--- bpf/stbtrace/vfs.st | 6 +++--- bpf/stbtrace/zio.st | 6 +++--- bpf/stbtrace/zpl.st | 6 +++--- lib/bcchelper.py | 4 ++-- 7 files changed, 20 insertions(+), 21 deletions(-) diff --git a/bpf/stbtrace/io.st b/bpf/stbtrace/io.st index 0dde504..62b19b6 100755 --- a/bpf/stbtrace/io.st +++ b/bpf/stbtrace/io.st @@ 
-1,5 +1,5 @@ # -# Copyright (c) 2019 by Delphix. All rights reserved. +# Copyright (c) 2019, 2020 by Delphix. All rights reserved. # # SPDX-License-Identifier: GPL-2.0-or-later # @@ -22,14 +22,13 @@ from bcchelper import BCCHelper # noqa: E402 # BPF disk io program -bpf_text = """ +bpf_text = '#include "' + base_dir + 'lib/bcc_helper.h' + '"\n' +bpf_text += """ #include #include #include #include #include -#include "/opt/delphix/server/etc/bcc_helper.h" - // Definitions for this script #define READ_STR "read" diff --git a/bpf/stbtrace/iscsi.st b/bpf/stbtrace/iscsi.st index b5f08ad..bb17237 100755 --- a/bpf/stbtrace/iscsi.st +++ b/bpf/stbtrace/iscsi.st @@ -1,5 +1,5 @@ # -# Copyright (c) 2019 by Delphix. All rights reserved. +# Copyright (c) 2019, 2020 by Delphix. All rights reserved. # # SPDX-License-Identifier: GPL-2.0-or-later # @@ -21,11 +21,11 @@ sys.path.append(base_dir + 'lib/') from bcchelper import BCCHelper # noqa: E402 # BPF txg program -bpf_text = """ +bpf_text = '#include "' + base_dir + 'lib/bcc_helper.h' + '"\n' +bpf_text += """ #include #include #include -#include "/opt/delphix/server/etc/bcc_helper.h" #include "target/iscsi/iscsi_target_core.h" diff --git a/bpf/stbtrace/nfs.st b/bpf/stbtrace/nfs.st index 9671007..066e23a 100755 --- a/bpf/stbtrace/nfs.st +++ b/bpf/stbtrace/nfs.st @@ -1,5 +1,5 @@ # -# Copyright (c) 2019 by Delphix. All rights reserved. +# Copyright (c) 2019, 2020 by Delphix. All rights reserved. # # SPDX-License-Identifier: GPL-2.0-or-later # @@ -21,12 +21,12 @@ sys.path.append(base_dir + 'lib/') from bcchelper import BCCHelper # noqa: E402 # BPF txg program -bpf_text = """ +bpf_text = '#include "' + base_dir + 'lib/bcc_helper.h' + '"\n' +bpf_text += """ #include #include #include #include -#include "/opt/delphix/server/etc/bcc_helper.h" // nfsd4 definitions from fs/nfsd/xdr4.h diff --git a/bpf/stbtrace/vfs.st b/bpf/stbtrace/vfs.st index 038eae2..693e0d9 100644 --- a/bpf/stbtrace/vfs.st +++ b/bpf/stbtrace/vfs.st @@ -1,5 +1,5 @@ # -# Copyright (c) 2019 by Delphix. All rights reserved. +# Copyright (c) 2019, 2020 by Delphix. All rights reserved. # # SPDX-License-Identifier: GPL-2.0-or-later # @@ -21,11 +21,11 @@ sys.path.append(base_dir + 'lib/') from bcchelper import BCCHelper # noqa: E402 # BPF txg program -bpf_text = """ +bpf_text = '#include "' + base_dir + 'lib/bcc_helper.h' + '"\n' +bpf_text += """ #include #include #include -#include "/opt/delphix/server/etc/bcc_helper.h" // Definitions for this script #define READ_STR "read" diff --git a/bpf/stbtrace/zio.st b/bpf/stbtrace/zio.st index a01ab4e..d4115a8 100644 --- a/bpf/stbtrace/zio.st +++ b/bpf/stbtrace/zio.st @@ -1,5 +1,5 @@ # -# Copyright (c) 2019 by Delphix. All rights reserved. +# Copyright (c) 2019, 2020 by Delphix. All rights reserved. # # SPDX-License-Identifier: GPL-2.0-or-later # @@ -21,11 +21,11 @@ sys.path.append(base_dir + 'lib/') from bcchelper import BCCHelper # noqa: E402 # BPF txg program -bpf_text = """ +bpf_text = '#include "' + base_dir + 'lib/bcc_helper.h' + '"\n' +bpf_text += """ #include #include #include -#include "/opt/delphix/server/etc/bcc_helper.h" #include #include diff --git a/bpf/stbtrace/zpl.st b/bpf/stbtrace/zpl.st index 41df731..a22e91f 100644 --- a/bpf/stbtrace/zpl.st +++ b/bpf/stbtrace/zpl.st @@ -1,5 +1,5 @@ # -# Copyright (c) 2019 by Delphix. All rights reserved. +# Copyright (c) 2019, 2020 by Delphix. All rights reserved. 
# # SPDX-License-Identifier: GPL-2.0-or-later # @@ -22,11 +22,11 @@ from bcchelper import BCCHelper  # noqa: E402 # BPF txg program -bpf_text = """ +bpf_text = '#include "' + base_dir + 'lib/bcc_helper.h' + '"\n' +bpf_text += """ #include #include #include -#include "/opt/delphix/server/etc/bcc_helper.h" #include diff --git a/lib/bcchelper.py b/lib/bcchelper.py index 211b848..2e109fd 100755 --- a/lib/bcchelper.py +++ b/lib/bcchelper.py @@ -1,5 +1,5 @@ # -# Copyright 2018, 2019 Delphix. All rights reserved. +# Copyright 2018, 2020 Delphix. All rights reserved. # # SPDX-License-Identifier: GPL-2.0-or-later # @@ -16,7 +16,7 @@ https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#1-bpf Definitions for C helper routines and macros are in -/opt/delphix/server/etc/bcc_helper.h +lib/bcc_helper.h BCC Helper focuses on printing out data from a set of tracing aggregations. There are three supported scalar types of aggregations (count, sum, and From d80a17d942c07e1d22da86c9a296f834ed84eb99 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 13 Jan 2020 17:34:36 -0800 Subject: [PATCH 14/43] ZIL fixes --- bpf/standalone/zil.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/bpf/standalone/zil.py b/bpf/standalone/zil.py index 0052972..6db4666 100755 --- a/bpf/standalone/zil.py +++ b/bpf/standalone/zil.py @@ -25,16 +25,15 @@ import argparse import sys import os -repo_lib_dir = os.path.dirname(__file__) + "/../../lib/" -if os.path.exists(repo_lib_dir + "bcchelper.py"): - sys.path.append(repo_lib_dir) -else: - sys.path.append("/usr/share/performance-diagnostics/lib/") -from bcchelper import BCCHelper # noqa +base_dir = os.path.dirname(__file__) + "/../../" +if not os.path.exists(base_dir + "lib/bcchelper.py"): + base_dir = "/usr/share/performance-diagnostics/" +sys.path.append(base_dir + 'lib/') +from bcchelper import BCCHelper   # noqa: E402 # define BPF program -bpf_text = """ -#include "/opt/delphix/server/etc/bcc_helper.h" +bpf_text = '#include "' + base_dir + 'lib/bcc_helper.h' + '"\n' +bpf_text += """ #include #include #include @@ -132,7 +131,7 @@ return 0; info.alloc_count = 0; - info.sync = ioflag & (FSYNC | FDSYNC) || + info.sync = ioflag & (O_SYNC | O_DSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS; zil_info_map.update(&tid, &info); return 0; From aba6e7fc026aff711bc61d46eb76c3d4b23d34d0 Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Fri, 3 Jan 2020 02:17:15 +0000 Subject: [PATCH 15/43] Throughput values are not normalized closes #23 --- cmd/estat.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/cmd/estat.py b/cmd/estat.py index ea89a89..25e61d1 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -88,7 +88,6 @@ def die(*args, **kwargs): -z/-Z enable/disable size histograms (default: off) -q/-Q enable/disable latency histograms by size (default: off) -y/-Y enable/disable the summary output (default: on) - -n/-N enable/disable normalizing summary by time (default: on) -t/-T enable/disable emitting the summary total (default: on) -d LEVEL set BCC debug level -e emit the resulting eBPF script without executing it @@ -155,7 +154,6 @@ class Args: setattr(args, "size_hist", False) setattr(args, "latsize_hist", False) setattr(args, "summary", True) -setattr(args, "normalize", True) setattr(args, "total", True) # @@ -193,7 +191,6 @@ class Args: '-z': "size_hist", '-q': "latsize_hist", '-y': "summary", - '-n': "normalize", '-t': "total"} if opt in switches: setattr(args, switches[opt], True) @@ -495,9 +492,10 @@ class
Args: ds__delta = ds__end - ds__start if not accum: ds__start = ds__end - if args.summary and args.normalize: - helper1.normalize("ops", ds__delta // 1000000000) - helper3.normalize("opst", ds__delta // 1000000000) + helper1.normalize("ops", ds__delta // 1000000000) + helper1.normalize("data", ds__delta // 1000000000) + helper3.normalize("opst", ds__delta // 1000000000) + helper3.normalize("datat", ds__delta // 1000000000) clear_data = not accum if args.latsize_hist: helper2.printall(clear_data) @@ -516,9 +514,10 @@ class Args: pass try: ds__delta = int(os.popen("date +%s%N").readlines()[0]) - ds__start - if args.summary and args.normalize: - helper1.normalize("ops", ds__delta // 1000000000) - helper3.normalize("opst", ds__delta // 1000000000) + helper1.normalize("ops", ds__delta // 1000000000) + helper1.normalize("data", ds__delta // 1000000000) + helper3.normalize("opst", ds__delta // 1000000000) + helper3.normalize("datat", ds__delta // 1000000000) if args.latsize_hist: helper2.printall() if args.lat_hist or args.size_hist or args.summary: From 7835d1f6b531441e07012a98182091b99ab1aeef Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 20 Feb 2020 14:17:40 -0800 Subject: [PATCH 16/43] Invalid mem errors when using latest BCC --- bpf/estat/backend-io.c | 7 +++---- bpf/stbtrace/io.st | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/bpf/estat/backend-io.c b/bpf/estat/backend-io.c index ba998f9..3ddf679 100644 --- a/bpf/estat/backend-io.c +++ b/bpf/estat/backend-io.c @@ -15,9 +15,8 @@ #define READ_STR "read " #define WRITE_STR "write " #define OP_NAME_LEN 7 -#define DEV_NAME_LEN 8 #define NAME_LENGTH (OP_NAME_LEN + 1) -#define AXIS_LENGTH (DEV_NAME_LEN + 1) +#define AXIS_LENGTH (DISK_NAME_LEN + 1) // Structure to hold thread local data typedef struct { @@ -25,7 +24,7 @@ typedef struct { unsigned int size; unsigned int cmd_flags; u32 err; - char device[DEV_NAME_LEN]; + char device[DISK_NAME_LEN]; } io_data_t; BPF_HASH(io_base_data, u64, io_data_t); @@ -40,7 +39,7 @@ disk_io_start(struct pt_regs *ctx, struct request *reqp) data.ts = bpf_ktime_get_ns(); data.cmd_flags = reqp->cmd_flags; data.size = reqp->__data_len; - __builtin_memcpy(&data.device, diskp->disk_name, DEV_NAME_LEN); + bpf_probe_read_str(&data.device, DISK_NAME_LEN, diskp->disk_name); io_base_data.update((u64 *) &reqp, &data); return (0); } diff --git a/bpf/stbtrace/io.st b/bpf/stbtrace/io.st index 62b19b6..266957a 100755 --- a/bpf/stbtrace/io.st +++ b/bpf/stbtrace/io.st @@ -69,7 +69,7 @@ int disk_io_start(struct pt_regs *ctx, struct request *reqp) data.ts = bpf_ktime_get_ns(); data.cmd_flags = reqp->cmd_flags; data.size = reqp->__data_len; - __builtin_memcpy(&data.device, diskp->disk_name, DISK_NAME_LEN); + bpf_probe_read_str(&data.device, DISK_NAME_LEN, diskp->disk_name); io_base_data.update((u64 *) &reqp, &data); return 0; } From 1e722e3bdb52a00e17560f391661611687e2f5cc Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Thu, 12 Mar 2020 22:07:55 -0600 Subject: [PATCH 17/43] Add arc_prefetch script. (#34) --- bpf/standalone/arc_prefetch.py | 317 +++++++++++++++++++++++++++++++++ lib/bcc_helper.h | 8 + lib/bcchelper.py | 104 ++++++++++- 3 files changed, 425 insertions(+), 4 deletions(-) create mode 100755 bpf/standalone/arc_prefetch.py diff --git a/bpf/standalone/arc_prefetch.py b/bpf/standalone/arc_prefetch.py new file mode 100755 index 0000000..768f7b1 --- /dev/null +++ b/bpf/standalone/arc_prefetch.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, 2016 by Delphix. 
All rights reserved. +# + +# +# This script provides read latency data for prefetch I/O. +# +# usage: arc-prefetch.d +# + +from bcc import BPF +from time import sleep +from time import strftime +import argparse +import os +import sys + +# +# Find BCCHelper. If we are being run from the repo, we should be able to find +# it in the repo's lib/ directory. If we can't find that, look for BCCHelper +# in its install location. +# +base_dir = os.path.dirname(__file__) + "/../../" +if not os.path.exists(base_dir + "lib/bcchelper.py"): + base_dir = "/usr/share/performance-diagnostics/" +sys.path.append(base_dir + 'lib/') +# flake8 wants these at the top of the file, but path update must come first +from bcchelper import BCCHelper # noqa: E402 +from bcchelper import BCCMapIndex # noqa: E402 +from bcchelper import BCCPerCPUIntArray # noqa: E402 +from bcchelper import BCCPoolCompare # noqa: E402 + + +bpf_text = '#include "' + base_dir + 'lib/bcc_helper.h' + '"\n' +bpf_text += """ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +""" +parser = argparse.ArgumentParser( + description='Collect arc_prefetch statistics.', + usage='estat arc-prefetch [options]') +parser.add_argument('-p', '--pool', type=str, action='store', + dest='pool', + help='The pool to monitor (default: domain0)') +args = parser.parse_args() + +# Add pool POOL_COMPARE macro to the bpf_text C code +if (args.pool): + pool = args.pool +else: + pool = "domain0" +pool_compare = BCCPoolCompare(pool) +if not pool_compare.get_pool_pointer(): + print("Warning: No pool filtering, unable to find zfs pool " + pool) +bpf_text += pool_compare.get_pool_compare_code() + +bpf_text += """ +#define ARC_FLAG_PREFETCH (1 << 2) /* I/O is a prefetch */ + +typedef struct { + u64 zfs_read_ts; + u64 arc_read_ts; + zio_t *zio; +}arc_prefetch_info_t; + +typedef struct { + u64 t; + u32 index; + u64 cpuid; +} lat_key; + +HIST_KEY(hist_lat_key, lat_key); + +BPF_HASH(arc_prefetch_info, u32, arc_prefetch_info_t); +BPF_HASH(zio_read_exit_time, zio_t *, u64); + +BPF_HASH(read_latency, hist_lat_key, u64); +BPF_HASH(read_average, lat_key, average_t); +BPF_PERCPU_ARRAY(arc_count, u32, NCOUNT_INDEX); + +int zfs_read_entry(struct pt_regs *ctx, struct inode *ip) +{ + u32 tid = bpf_get_current_pid_tgid(); + u64 ts = bpf_ktime_get_ns(); + arc_prefetch_info_t info = {ts}; + + // filter by pool + zfsvfs_t *zfsvfs = ip->i_sb->s_fs_info; + objset_t *z_os = zfsvfs->z_os; + spa_t *spa = z_os->os_spa; + if (POOL_COMPARE(spa)) + arc_prefetch_info.update(&tid, &info); + + return 0; +} + +int zfs_read_return(struct pt_regs *ctx) +{ + u32 tid = bpf_get_current_pid_tgid(); + arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid); + + if (info == NULL) + return 0; + + if (info != NULL) + arc_prefetch_info.delete(&tid); + return 0; +} + + +int arc_hit_entry(struct pt_regs *ctx) +{ + u32 tid = bpf_get_current_pid_tgid(); + arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid); + if (info == NULL || info->zfs_read_ts == 0) + return 0; + + arc_count.increment(ARC_HIT_COUNT); + return 0; +} + +int arc_miss_entry(struct pt_regs *ctx) +{ + u32 tid = bpf_get_current_pid_tgid(); + arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid); + if (info == NULL || info->zfs_read_ts == 0) + return 0; + + arc_count.increment(ARC_MISS_COUNT); + return 0; +} + +int arc_read_entry(struct pt_regs *ctx) +{ + u32 tid = bpf_get_current_pid_tgid(); + arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid); + if (info == NULL || 
info->zfs_read_ts == 0) + return 0; + + info->arc_read_ts = bpf_ktime_get_ns(); + return 0; +} + +int arc_read_return(struct pt_regs *ctx) +{ + u32 tid = bpf_get_current_pid_tgid(); + arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid); + if (info == NULL || info->arc_read_ts == 0) + return 0; + + u64 elapsed = (bpf_ktime_get_ns() - info->arc_read_ts) / 1000; + lat_key lkey = {1, ARC_ISSUE_LATENCY, 0}; + lkey.cpuid = bpf_get_smp_processor_id(); + u32 slot = bpf_log2l(elapsed); + HIST_KEY_INITIALIZE(hist_lat_key, hkey, lkey, slot); + read_latency.increment(hkey); + + average_t *average = read_average.lookup(&lkey); + if (average == NULL) { + average_t initial_average = {1, elapsed}; + read_average.update(&lkey, &initial_average); + return 0; + } + average->count += 1; + average->sum += elapsed; + return 0; +} + +int zio_read_entry(struct pt_regs *ctx, zio_t *zio) +{ + u32 tid = bpf_get_current_pid_tgid(); + arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid); + if (info == NULL || info->arc_read_ts == 0) + return 0; + + info->zio = zio; + return 0; +} + +int zio_read_return(struct pt_regs *ctx) +{ + u32 tid = bpf_get_current_pid_tgid(); + u64 zio_exit_ts = bpf_ktime_get_ns(); + arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid); + if (info == NULL || info->zio == NULL) + return 0; + + zio_read_exit_time.update(&info->zio, &zio_exit_ts); + return 0; +} + +int arc_read_done_entry(struct pt_regs *ctx, zio_t *zio) +{ + arc_buf_hdr_t *hdr = (arc_buf_hdr_t *) zio->io_private; + u64 zero = 0; + + u64 *zio_exit_ts = zio_read_exit_time.lookup(&zio); + if (zio_exit_ts == NULL || *zio_exit_ts == 0) { + return 0; + } + + u64 zio_ts = *zio_exit_ts; + u64 elapsed = (bpf_ktime_get_ns() - zio_ts) / 1000; + + hist_lat_key hkey = { }; + lat_key *lkey = HIST_KEY_GET_AGGKEY(&hkey); + u32 count_index; + if (hdr->b_flags & ARC_FLAG_PREFETCH) { + count_index = ARC_PREFETCH_ZIO_COUNT; + lkey->index = ARC_PREFETCH_ZIO_LATENCY; + } else { // normal zio + count_index = ARC_NORMAL_ZIO_COUNT; + lkey->index = ARC_NORMAL_ZIO_LATENCY; + } + lkey->t = 1; + lkey->cpuid = bpf_get_smp_processor_id(); + + HIST_KEY_SET_SLOT(&hkey, bpf_log2(elapsed)); + arc_count.increment(count_index); + read_latency.increment(hkey); + zio_read_exit_time.update(&zio, &zero); + average_t *average = read_average.lookup(lkey); + if (average == NULL) { + average_t initial_average = {1, elapsed}; + read_average.update(lkey, &initial_average); + return 0; + } + average->count += 1; + average->sum += elapsed; + + return 0; + } + +""" + + +class ArcCountIndex(BCCMapIndex): + ARC_HIT_COUNT = (0, 'ARC_HIT_COUNT', 'prefetch hit count') + ARC_MISS_COUNT = (1, 'ARC_MISS_COUNT', 'prefetch miss count') + ARC_NORMAL_ZIO_COUNT = (2, 'ARC_NORMAL_ZIO_COUNT', 'normal read count') + ARC_PREFETCH_ZIO_COUNT = (3, 'ARC_PREFETCH_ZIO_COUNT', + 'prefetch read count') + + +class ArcLatencyIndex(BCCMapIndex): + ARC_ISSUE_LATENCY = (0, 'ARC_ISSUE_LATENCY', 'arc read latency') + ARC_NORMAL_ZIO_AVERAGE = (1, 'ARC_NORMAL_ZIO_LATENCY', + 'normal read latency') + ARC_PREFETCH_ZIO_AVERAGE = (2, 'ARC_PREFETCH_ZIO_LATENCY', + 'prefetch read latency') + + +KVER = os.popen('uname -r').read().rstrip() + +flags = ["-include", + "/usr/src/zfs-" + KVER + "/zfs_config.h", + "-include", + "/usr/src/zfs-" + KVER + "/include/spl/sys/types.h", + "-I/usr/src/zfs-" + KVER + "/include/", + "-I/usr/src/zfs-" + KVER + "/include/spl/", + "-I/usr/src/zfs-" + KVER + "/include/linux", + "-DCC_USING_FENTRY", + "-DNCOUNT_INDEX=" + str(len(ArcCountIndex)), + "-DNAVERAGE_INDEX=" + 
str(len(ArcLatencyIndex))] \ + + ArcCountIndex.getCDefinitions() \ + + ArcLatencyIndex.getCDefinitions() + +b = BPF(text=bpf_text, cflags=flags) + +b.attach_kprobe(event="zfs_read", fn_name="zfs_read_entry") +b.attach_kretprobe(event="zfs_read", fn_name="zfs_read_return") +b.attach_kprobe(event="trace_zfs_arc__hit", fn_name="arc_hit_entry") +b.attach_kprobe(event="trace_zfs_arc__miss", fn_name="arc_miss_entry") +b.attach_kprobe(event="arc_read", fn_name="arc_read_entry") +b.attach_kretprobe(event="arc_read", fn_name="arc_read_return") +b.attach_kprobe(event="zio_read", fn_name="zio_read_entry") +b.attach_kretprobe(event="zio_read", fn_name="zio_read_return") +b.attach_kprobe(event="arc_read_done", fn_name="arc_read_done_entry") + +read_latency_helper = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) +read_latency_helper.add_aggregation("read_latency", + BCCHelper.LOG_HISTOGRAM_AGGREGATION, + "microseconds") +read_latency_helper.add_aggregation("read_average", + BCCHelper.AVERAGE_AGGREGATION, + "avg latency(us)") +read_latency_helper.add_key_type("index", BCCHelper.MAP_INDEX_TYPE, + ArcLatencyIndex) +call_count_helper = BCCPerCPUIntArray(b, "arc_count", ArcCountIndex) + + +while True: + try: + sleep(5) + except KeyboardInterrupt: + print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) + call_count_helper.printall() + read_latency_helper.printall() + break + try: + print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) + call_count_helper.printall() + read_latency_helper.printall() + except Exception as e: + print(str(e)) + break diff --git a/lib/bcc_helper.h b/lib/bcc_helper.h index ad2c443..d9d4926 100644 --- a/lib/bcc_helper.h +++ b/lib/bcc_helper.h @@ -26,6 +26,14 @@ typedef struct { \ u64 slot; \ } hist_key_type; +#define HIST_KEY_INITIALIZE(hist_key_type, hist_key, agg_key, slot) \ + hist_key_type hist_key = {agg_key, slot}; +#define HIST_KEY_GET_AGGKEY(hist_key_ptr) (&(hist_key_ptr)->agg_key) +#define HIST_KEY_GET_SLOT(hist_key_ptr) ((hist_key_ptr)->slot) +#define HIST_KEY_SET_SLOT(hist_key_ptr, nslot) (hist_key_ptr)->slot = nslot; +#define HIST_KEY_SET_AGGKEY(hist_key_ptr, agg_key) (hist_key_ptr)->agg_key = agg_key; + + /* * This function returns the slot, or histogram bucket, for a value based * on log linear distribution equivalent to dtrace llquantize(*, 10, 4, 10, 10). 
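For orientation, the slot function described in that comment can be approximated in plain Python. This is a rough sketch of dtrace-style llquantize(*, 10, 4, 10, 10) bucketing (factor 10, magnitudes 10^4 through 10^10, 10 linear steps per magnitude); the function name is illustrative only, and the authoritative implementation is the C routine in lib/bcc_helper.h:

def llq_slot(value, factor=10, low=4, high=10, nsteps=10):
    # Values below factor**low collapse into a single underflow bucket.
    if value < factor ** low:
        return 0
    slot = 1
    for mag in range(low, high + 1):
        base = factor ** mag
        if value < base * factor:
            # nsteps linear buckets spanning this order of magnitude
            return slot + (value - base) * nsteps // (base * (factor - 1))
        slot += nsteps
    return slot  # values >= factor**(high + 1) share one overflow bucket

For example, llq_slot(25000) returns 2: a 25,000-microsecond value lands in the second linear bucket of the 10^4 decade. Log-linear bucketing keeps per-decade resolution without the bucket explosion of a purely linear histogram.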
diff --git a/lib/bcchelper.py b/lib/bcchelper.py index 2e109fd..f424fe4 100755 --- a/lib/bcchelper.py +++ b/lib/bcchelper.py @@ -5,9 +5,11 @@ # from time import time +from enum import Enum import logging import re import signal +import subprocess """ BCC Helper This module contains the BCCHelper class to aid in writing BCC Tracing @@ -105,6 +107,8 @@ class BCCHelper: # DEFAULT_KEY_TYPE = 0 IP_KEY_TYPE = 1 + CUSTOM_KEY_TYPE = 2 + MAP_INDEX_TYPE = 3 # Format string for print histograms ESTAT_HIST_FORMAT = '{:<16} {:>16} {:<41}' ESTAT_HEADER_FORMAT = ' {:<20}{:>49}' @@ -232,11 +236,11 @@ def apply_normalization(self, name, value): value = value // n[1] return value - def add_key_type(self, name, type=None): + def add_key_type(self, name, type=None, display_function=None): """This key type expected in all aggregation keys.""" if type is None: type = self.DEFAULT_KEY_TYPE - self.key_types.append([name, type]) + self.key_types.append([name, type, display_function]) def walltime(self, ktime): """Convert to an epoch timestamp.""" @@ -414,6 +418,10 @@ def key_type_string(self, key, key_type): # if match: keystr = str(match.group()) + elif key_type[1] == self.CUSTOM_KEY_TYPE: + keystr = key_type[2](keystr) + elif key_type[1] == self.MAP_INDEX_TYPE: + keystr = key_type[2].displayName(int(keystr)) return keystr @@ -640,8 +648,9 @@ def items_to_string(self, agg_items): base_key = self.next_key(agg_items) if self.mode == self.ESTAT_PRINT_MODE: - outstr += estat_scalar_header + "\n" - outstr += estat_scalar_table + "\n\n" + if estat_scalar_table != "": + outstr += estat_scalar_header + "\n" + outstr += estat_scalar_table + "\n\n" return outstr @@ -650,3 +659,90 @@ def printall(self, clear=True): agg_items = self.get_ordered_items(clear) outstr = self.items_to_string(agg_items) print(outstr, end='', flush=True) + + +class BCCMapIndex(Enum): + """Provides index constants for bcc map keys""" + def __init__(self, index, csymbol, display): + self.index = index + self.csymbol = csymbol + self.display = display + + def index(self): + """The int value for this index.""" + return self.index + + def display(self): + """The display string for this index""" + return self.display + + def definition(self): + """The CPP definition for this index""" + return "-D" + self.csymbol + "=" + str(self.index) + "" + + @classmethod + def list(cls): + """List all indices in a subclass""" + return list(map(lambda c: c.value, cls)) + + @classmethod + def displayName(cls, i): + """The display name for a subclass index""" + return list(map(lambda c: c.value, cls))[i][2] + + @classmethod + def getCDefinitions(cls): + """Definitions for bcc C code""" + defs = [] + for index in cls: + defs.append(index.definition()) + return defs + + +class BCCPerCPUIntArray: + """Provide output support for a PerCPU Array indexed by a BCCMapIndex""" + def __init__(self, b, name, mapindex): + self.table = b[name] + self.name = name + self.mapindex = mapindex + + def printall(self, clear=True): + """ Print and clear all data from aggregations in the helper.""" + items = self.table.items() + if (clear): + self.table.clear() + + for item in items: + key = item[0] + value = item[1] + sum = 0 + # sum the values for each cpu + for v in value: + sum += v + if sum > 0: + print('{:20}'.format(self.mapindex.displayName(key.value)) + + ' : ' + str(sum)) + + +class BCCPoolCompare: + """Provide code to test for a zfs pool instance""" + def __init__(self, pool_name): + """leverage sdb to get a pointer for a pool name""" + self.pool_name = pool_name + pool_match = 
re.search(r'0x\w+', subprocess.getoutput( + "sudo sdb -e \'spa " + self.pool_name + "\'")) + if pool_match: + self.pool_pointer = pool_match.group(0) + else: + self.pool_pointer = None + + def get_pool_pointer(self): + """Return the kernel address for the spa_t object""" + return self.pool_pointer + + def get_pool_compare_code(self): + """Generate C POOL_COMPARE macro""" + if not self.pool_pointer: + return "#define POOL_COMPARE(spa) 1" + return str("#define POOL_COMPARE(spa) ((ulong)spa == " + + self.pool_pointer + ")") From 43b0a65f802e44ee938e9fa94390607c0cff97b7 Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Mon, 26 Oct 2020 14:11:40 -0600 Subject: [PATCH 18/43] DLPX-72430 Estat zil fails to run because of unknown types (#50) --- bpf/standalone/zil.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/bpf/standalone/zil.py b/bpf/standalone/zil.py index 6db4666..7daeea5 100755 --- a/bpf/standalone/zil.py +++ b/bpf/standalone/zil.py @@ -37,9 +37,7 @@ #include #include #include -#include -#include -#include +#include #include #include #include @@ -256,11 +254,10 @@ b = BPF(text=bpf_text, cflags=["-include", "/usr/src/zfs-" + KVER + "/zfs_config.h", + "-include", + "/usr/src/zfs-" + KVER + "/include/spl/sys/types.h", "-I/usr/src/zfs-" + KVER + "/include/", - "-I/usr/src/zfs-" + KVER + "/include/spl", - "-I/usr/src/zfs-" + KVER + "/include/", - "-I/usr/src/zfs-" + KVER + "/include/linux", - "-DCC_USING_FENTRY"]) + "-I/usr/src/zfs-" + KVER + "/include/spl"]) b.attach_kprobe(event="zfs_write", fn_name="zfs_write_entry") b.attach_kretprobe(event="zfs_write", fn_name="zfs_write_return") From ccd99203264095da1a0c37f49dffbe0d1e79e61b Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Mon, 2 Nov 2020 10:35:17 -0700 Subject: [PATCH 19/43] DLPX-72556 estat warning messages (#52) --- bpf/standalone/arc_prefetch.py | 1 - bpf/standalone/txg.py | 3 +-- bpf/stbtrace/zio.st | 3 +-- bpf/stbtrace/zpl.st | 3 +-- cmd/estat.py | 3 +-- 5 files changed, 4 insertions(+), 9 deletions(-) diff --git a/bpf/standalone/arc_prefetch.py b/bpf/standalone/arc_prefetch.py index 768f7b1..3c82019 100755 --- a/bpf/standalone/arc_prefetch.py +++ b/bpf/standalone/arc_prefetch.py @@ -270,7 +270,6 @@ class ArcLatencyIndex(BCCMapIndex): "-I/usr/src/zfs-" + KVER + "/include/", "-I/usr/src/zfs-" + KVER + "/include/spl/", "-I/usr/src/zfs-" + KVER + "/include/linux", - "-DCC_USING_FENTRY", "-DNCOUNT_INDEX=" + str(len(ArcCountIndex)), "-DNAVERAGE_INDEX=" + str(len(ArcLatencyIndex))] \ + ArcCountIndex.getCDefinitions() \ diff --git a/bpf/standalone/txg.py b/bpf/standalone/txg.py index 5ca0262..583ee91 100755 --- a/bpf/standalone/txg.py +++ b/bpf/standalone/txg.py @@ -349,8 +349,7 @@ def print_event(cpu, data, size): "-I/usr/src/zfs-" + KVER + "/include/", "-I/usr/src/zfs-" + KVER + "/include/spl", "-I/usr/src/zfs-" + KVER + "/include/", - "-I/usr/src/zfs-" + KVER + "/include/linux", - "-DCC_USING_FENTRY"]) + "-I/usr/src/zfs-" + KVER + "/include/linux"]) b.attach_kprobe(event="spa_sync", fn_name="spa_sync_entry") b.attach_kretprobe(event="spa_sync", fn_name="spa_sync_return") diff --git a/bpf/stbtrace/zio.st b/bpf/stbtrace/zio.st index d4115a8..f09eb7d 100644 --- a/bpf/stbtrace/zio.st +++ b/bpf/stbtrace/zio.st @@ -130,8 +130,7 @@ b = BPF(text=bpf_text, cflags=["-include", "/usr/src/zfs-" + KVER + "/zfs_config.h", "-I/usr/src/zfs-" + KVER + "/include/", "-I/usr/src/zfs-" + KVER + "/include/spl/", - "-I/usr/src/zfs-" + KVER + "/include/linux", - "-DCC_USING_FENTRY"]) + "-I/usr/src/zfs-" + KVER + "/include/linux"]) 
b.attach_kretprobe(event="vdev_queue_io_to_issue", fn_name="vdev_queue_issue_return") diff --git a/bpf/stbtrace/zpl.st b/bpf/stbtrace/zpl.st index a22e91f..c10e918 100644 --- a/bpf/stbtrace/zpl.st +++ b/bpf/stbtrace/zpl.st @@ -178,8 +178,7 @@ b = BPF(text=bpf_text, "-include", "/usr/src/zfs-" + KVER + "/include/spl/sys/types.h", "-I/usr/src/zfs-" + KVER + "/include/", - "-I/usr/src/zfs-" + KVER + "/include/spl/", - "-DCC_USING_FENTRY"]) + "-I/usr/src/zfs-" + KVER + "/include/spl/"]) b.attach_kprobe(event="zfs_read", fn_name="zfs_read_start") b.attach_kprobe(event="zfs_write", fn_name="zfs_write_start") diff --git a/cmd/estat.py b/cmd/estat.py index 25e61d1..c0d9e16 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -394,8 +394,7 @@ class Args: "-include", "/usr/src/zfs-" + KVER + "/include/spl/sys/types.h", "-I/usr/src/zfs-" + KVER + "/include/", - "-I/usr/src/zfs-" + KVER + "/include/spl", - "-DCC_USING_FENTRY"] + "-I/usr/src/zfs-" + KVER + "/include/spl"] if script_arg: cflags.append("-DOPTARG=\"" + script_arg + "\"") From 2c79d813fc1a39dcef9547fc78b01cf523f3ad60 Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Wed, 3 Feb 2021 10:34:06 -0700 Subject: [PATCH 20/43] Update estat iscsi, zvol, and zpl scripts. (#55) closes #53, closes #54 --- bpf/estat/iscsi.c | 7 +++---- bpf/estat/zpl.c | 30 +++++++++++++++--------------- bpf/estat/zvol.c | 37 +++++++------------------------------ 3 files changed, 25 insertions(+), 49 deletions(-) diff --git a/bpf/estat/iscsi.c b/bpf/estat/iscsi.c index 562b0f8..b406975 100644 --- a/bpf/estat/iscsi.c +++ b/bpf/estat/iscsi.c @@ -19,7 +19,6 @@ typedef struct { u64 ts; - u64 flags; u64 size; } iscsi_data_t; @@ -33,7 +32,6 @@ iscsi_target_start(struct pt_regs *ctx, struct iscsi_conn *conn, { iscsi_data_t data = {}; data.ts = bpf_ktime_get_ns(); - data.flags = hdr->flags; data.size = hdr->data_length; iscsi_base_data.update((u64 *) &cmd, &data); @@ -56,6 +54,7 @@ aggregate_data(iscsi_data_t *data, u64 ts, char *opstr) } // @@ kprobe|iscsit_build_rsp_pdu|iscsi_target_end +// @@ kprobe|iscsit_build_datain_pdu|iscsi_target_end int iscsi_target_end(struct pt_regs *ctx, struct iscsi_cmd *cmd) { @@ -67,9 +66,9 @@ iscsi_target_end(struct pt_regs *ctx, struct iscsi_cmd *cmd) return (0); // missed issue } - if (data->flags & ISCSI_FLAG_CMD_READ) { + if (cmd->data_direction == DMA_FROM_DEVICE) { aggregate_data(data, ts, READ_STR); - } else if (data->flags & ISCSI_FLAG_CMD_WRITE) { + } else if (cmd->data_direction & DMA_TO_DEVICE) { aggregate_data(data, ts, WRITE_STR); } iscsi_base_data.delete((u64 *) &cmd); diff --git a/bpf/estat/zpl.c b/bpf/estat/zpl.c index 09e3b8e..22eac4a 100644 --- a/bpf/estat/zpl.c +++ b/bpf/estat/zpl.c @@ -27,6 +27,10 @@ BPF_HASH(io_info_map, u32, io_info_t); #else #define POOL (OPTARG) #endif +#define ZFS_READ_SYNC_LENGTH 14 +#define ZFS_READ_ASYNC_LENGTH 15 +#define ZFS_WRITE_SYNC_LENGTH 15 +#define ZFS_WRITE_ASYNC_LENGTH 16 // TODO factor this out into a helper so that it isn't duplicated static inline bool @@ -97,24 +101,20 @@ zfs_read_write_exit(struct pt_regs *ctx, struct inode *ip, uio_t *uio) u64 delta = bpf_ktime_get_ns() - info->start_time; - char name[32]; + char name[16]; int offset; if (info->is_write) { - const char s[] = "zfs_write"; - __builtin_memcpy(&name, s, sizeof (s)); - offset = sizeof (s) - 1; + if (info->is_sync) { + __builtin_memcpy(name, "zfs_write sync", ZFS_WRITE_SYNC_LENGTH); + } else { + __builtin_memcpy(name, "zfs_write async", ZFS_WRITE_ASYNC_LENGTH); + } } else { - const char s[] = "zfs_read"; - __builtin_memcpy(&name, 
s, sizeof (s)); - offset = sizeof (s) - 1; - } - - if (info->is_sync) { - const char s[] = " sync"; - __builtin_memcpy(name + offset, s, sizeof (s)); - } else { - const char s[] = " async"; - __builtin_memcpy(name + offset, s, sizeof (s)); + if (info->is_sync) { + __builtin_memcpy(name, "zfs_read sync", ZFS_READ_SYNC_LENGTH); + } else { + __builtin_memcpy(name, "zfs_read async", ZFS_READ_ASYNC_LENGTH); + } } char axis = 0; diff --git a/bpf/estat/zvol.c b/bpf/estat/zvol.c index d799e3d..3dc170e 100644 --- a/bpf/estat/zvol.c +++ b/bpf/estat/zvol.c @@ -11,16 +11,19 @@ #include #include #include -#include #include +#include + #define ZVOL_WCE 0x8 #define ZVOL_READ 1 #define ZVOL_WRITE 2 #define NAME_LENGTH 6 -#define AXIS_LENGTH 5 +#define AXIS_LENGTH 6 #define READ_LENGTH 5 #define WRITE_LENGTH 6 +#define SYNC_LENGTH 5 +#define ASYNC_LENGTH 6 #ifndef OPTARG #define POOL "domain0" @@ -28,32 +31,6 @@ #define POOL (OPTARG) #endif -/* - * Copied and duplicated from "module/zfs/zvol.c" of the ZFS repository. - */ -typedef struct zvol_state { - char zv_name[MAXNAMELEN]; /* name */ - uint64_t zv_volsize; /* advertised space */ - uint64_t zv_volblocksize; /* volume block size */ - objset_t *zv_objset; /* objset handle */ - uint32_t zv_flags; /* ZVOL_* flags */ - uint32_t zv_open_count; /* open counts */ - uint32_t zv_changed; /* disk changed */ - zilog_t *zv_zilog; /* ZIL handle */ - rangelock_t zv_rangelock; /* for range locking */ - dnode_t *zv_dn; /* dnode hold */ - dev_t zv_dev; /* device id */ - struct gendisk *zv_disk; /* generic disk */ - struct request_queue *zv_queue; /* request queue */ - dataset_kstats_t zv_kstat; /* zvol kstats */ - list_node_t zv_next; /* next zvol_state_t linkage */ - uint64_t zv_hash; /* name hash */ - struct hlist_node zv_hlink; /* hash link */ - kmutex_t zv_state_lock; /* protects zvol_state_t */ - atomic_t zv_suspend_ref; /* refcount for suspend */ - krwlock_t zv_suspend_lock; /* suspend lock */ -} zvol_state_t; - // Structure to hold thread local data typedef struct { u64 ts; @@ -141,10 +118,10 @@ zvol_return(struct pt_regs *ctx) __builtin_memcpy(&name, "read", READ_LENGTH); } else if (sync) { __builtin_memcpy(&name, "write", WRITE_LENGTH); - __builtin_memcpy(&axis, "sync", WRITE_LENGTH); + __builtin_memcpy(&axis, "sync", SYNC_LENGTH); } else { __builtin_memcpy(&name, "write", WRITE_LENGTH); - __builtin_memcpy(&axis, "async", WRITE_LENGTH); + __builtin_memcpy(&axis, "async", ASYNC_LENGTH); } AGGREGATE_DATA(name, axis, delta, data->bytes); zvol_base_data.delete(&pid); From f0bb551969013b57a78e36783c1a95fba3d19c7c Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Tue, 23 Feb 2021 19:39:14 +0000 Subject: [PATCH 21/43] estat zil script compilation error on 6.1 closes #57 --- bpf/standalone/zil.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bpf/standalone/zil.py b/bpf/standalone/zil.py index 7daeea5..10ff3d7 100755 --- a/bpf/standalone/zil.py +++ b/bpf/standalone/zil.py @@ -115,14 +115,14 @@ return 0; } -int zfs_write_entry(struct pt_regs *ctx, struct inode *ip, -uio_t *uio, int ioflag) +int zfs_write_entry(struct pt_regs *ctx, struct znode *zn, +void *uio, int ioflag) { u32 tid = bpf_get_current_pid_tgid(); zil_tid_info_t info = {}; info.write_ts = bpf_ktime_get_ns(); - zfsvfs_t *zfsvfs = ip->i_sb->s_fs_info; + zfsvfs_t *zfsvfs = zn->z_inode.i_sb->s_fs_info; objset_t *z_os = zfsvfs->z_os; spa_t *spa = z_os->os_spa; if (!equal_to_pool(spa->spa_name)) From b095412722193d723496471bbe397608eb2c6df3 Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: 
Tue, 23 Feb 2021 19:46:49 +0000 Subject: [PATCH 22/43] estat zil script always reports an average of 10 allocations closes #58 --- bpf/standalone/zil.py | 119 +++++++++++++++++++++++++++++++++--------- 1 file changed, 94 insertions(+), 25 deletions(-) diff --git a/bpf/standalone/zil.py b/bpf/standalone/zil.py index 10ff3d7..c02994e 100755 --- a/bpf/standalone/zil.py +++ b/bpf/standalone/zil.py @@ -67,7 +67,6 @@ u64 lwb_ts; u64 io_ts; int sync; - int alloc_count; } zil_tid_info_t; BPF_HASH(zil_info_map, u32, zil_tid_info_t); @@ -83,7 +82,7 @@ HIST_KEY(zil_hist_key_t, zil_key_t); BPF_HASH(average_latency, zil_key_t, average_t); -BPF_HASH(average_allocs, zil_key_t, average_t); +BPF_HASH(call_counts, zil_key_t, u64); BPF_HASH(zil_latency, zil_hist_key_t, u64); static inline bool equal_to_pool(char *str) @@ -111,10 +110,21 @@ avg->sum += delta; hist_key.slot = bpf_log2l(delta); zil_latency.increment(hist_key); + call_counts.increment(key); return 0; } +static int count_call(char *name) { + zil_key_t key = {}; + __builtin_memcpy(&key.name, name, NAME_LENGTH); + key.t = 1; + key.cpuid = bpf_get_smp_processor_id(); + call_counts.increment(key); + return 0; +} + + int zfs_write_entry(struct pt_regs *ctx, struct znode *zn, void *uio, int ioflag) { @@ -128,7 +138,6 @@ if (!equal_to_pool(spa->spa_name)) return 0; - info.alloc_count = 0; info.sync = ioflag & (O_SYNC | O_DSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS; zil_info_map.update(&tid, &info); @@ -154,13 +163,50 @@ return 0; } +int zfs_fsync_entry(struct pt_regs *ctx, struct znode *zn) +{ + u32 tid = bpf_get_current_pid_tgid(); + zil_tid_info_t info = {}; + + info.write_ts = bpf_ktime_get_ns(); + zfsvfs_t *zfsvfs = zn->z_inode.i_sb->s_fs_info; + objset_t *z_os = zfsvfs->z_os; + spa_t *spa = z_os->os_spa; + if (!equal_to_pool(spa->spa_name)) + return 0; + + zil_info_map.update(&tid, &info); + return 0; +} + +int zfs_fsync_return(struct pt_regs *cts) +{ + u32 tid = bpf_get_current_pid_tgid(); + zil_tid_info_t *info = zil_info_map.lookup(&tid); + if (info == NULL) { + return 0; + } + + // Record only if zil_commit was called + if (info->commit_ts != 0) { + latency_average_and_histogram("zfs_fsync", + (bpf_ktime_get_ns() - info->write_ts) / 1000); + } + + zil_info_map.delete(&tid); + return 0; +} + + int zil_commit_entry(struct pt_regs *ctx) { u32 tid = bpf_get_current_pid_tgid(); zil_tid_info_t *info = zil_info_map.lookup(&tid); - if (info == NULL) + if (info == NULL) { return 0; + } + info->commit_ts = bpf_ktime_get_ns(); return 0; } @@ -176,14 +222,6 @@ latency_average_and_histogram("zil_commit", (bpf_ktime_get_ns() - info->commit_ts) / 1000); - zil_key_t key = {}; - key.t = 1; - key.cpuid = bpf_get_smp_processor_id(); - __builtin_memcpy(&key.name, "Allocations", NAME_LENGTH); - average_t zero_avg = ZERO_AVERAGE; - average_t *avg = average_allocs.lookup_or_init(&key, &zero_avg); - avg->count++; - avg->sum += 10; //info->alloc_count; return 0; } @@ -191,11 +229,10 @@ int zil_get_commit_list_entry(struct pt_regs *ctx) { u32 tid = bpf_get_current_pid_tgid(); - u64 ts = bpf_ktime_get_ns(); zil_tid_info_t *info = zil_info_map.lookup(&tid); if (info == NULL) return 0; - info->lwb_ts = ts; + info->lwb_ts = bpf_ktime_get_ns(); return 0; } @@ -207,7 +244,9 @@ if (info == NULL) { return 0; } - + if (info->lwb_ts == 0) { + return 0; + } return latency_average_and_histogram("allocation", (bpf_ktime_get_ns() - info->lwb_ts) / 1000); @@ -243,10 +282,33 @@ if (info == NULL) { return 0; } - info->alloc_count++; + count_call("block allocations"); + return 0; +} + +int
zil_commit_waiter_skip_entry(struct pt_regs *cts) +{ + u32 tid = bpf_get_current_pid_tgid(); + zil_tid_info_t *info = zil_info_map.lookup(&tid); + if (info == NULL) { + return 0; + } + + count_call("waiter skip"); return 0; } +int zil_commit_writer_stall_entry(struct pt_regs *cts) +{ + u32 tid = bpf_get_current_pid_tgid(); + zil_tid_info_t *info = zil_info_map.lookup(&tid); + if (info == NULL) { + return 0; + } + + count_call("writer stall"); + return 0; +} """ # load BPF program @@ -261,6 +323,8 @@ b.attach_kprobe(event="zfs_write", fn_name="zfs_write_entry") b.attach_kretprobe(event="zfs_write", fn_name="zfs_write_return") +b.attach_kprobe(event="zfs_fsync", fn_name="zfs_fsync_entry") +b.attach_kretprobe(event="zfs_fsync", fn_name="zfs_fsync_return") b.attach_kprobe(event="zil_commit", fn_name="zil_commit_entry") b.attach_kretprobe(event="zil_commit", fn_name="zil_commit_return") b.attach_kretprobe(event="zil_process_commit_list", @@ -271,19 +335,24 @@ fn_name="zil_commit_waiter_entry") b.attach_kretprobe(event="zil_commit_waiter", fn_name="zil_commit_waiter_return") -b.attach_kretprobe(event="zio_alloc_zil", fn_name="zio_alloc_zil_return") +b.attach_kretprobe(event="zio_alloc_zil", + fn_name="zio_alloc_zil_return") +b.attach_kprobe(event="zil_commit_waiter_skip", + fn_name="zil_commit_waiter_skip_entry") +b.attach_kprobe(event="zil_commit_writer_stall", + fn_name="zil_commit_writer_stall_entry") latency_helper = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) latency_helper.add_aggregation("average_latency", - BCCHelper.AVERAGE_AGGREGATION, "avg") + BCCHelper.AVERAGE_AGGREGATION, "avg latency") latency_helper.add_aggregation("zil_latency", BCCHelper.LOG_HISTOGRAM_AGGREGATION, "latency") latency_helper.add_key_type("name") -alloc_helper = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) -alloc_helper.add_aggregation("average_allocs", - BCCHelper.AVERAGE_AGGREGATION, "avg") -alloc_helper.add_key_type("name") +call_count_helper = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) +call_count_helper.add_aggregation("call_counts", + BCCHelper.COUNT_AGGREGATION, "count") +call_count_helper.add_key_type("name") if (not args.collection_sec): print(" Tracing enabled... Hit Ctrl-C to end.") @@ -294,7 +363,7 @@ try: print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) latency_helper.printall() - alloc_helper.printall() + call_count_helper.printall() exit(0) except Exception as e: print(str(e)) @@ -307,12 +376,12 @@ except KeyboardInterrupt: print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) latency_helper.printall() - alloc_helper.printall() + call_count_helper.printall() break try: print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) latency_helper.printall() - alloc_helper.printall() + call_count_helper.printall() except Exception as e: print(str(e)) break From 8d5620ba4fd0c28f47aa19fbfb51ca8fe5854293 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Wed, 19 May 2021 15:46:27 -0600 Subject: [PATCH 23/43] DLPX-75711 [Backport of DLPX-75405 to 6.0.9.0] nfs_threads script should flush output and include timestamps (#63) --- cmd/nfs_threads.py | 122 ++++++++++++++++++++++++++++----------------- 1 file changed, 75 insertions(+), 47 deletions(-) diff --git a/cmd/nfs_threads.py b/cmd/nfs_threads.py index 674c6d8..c633552 100755 --- a/cmd/nfs_threads.py +++ b/cmd/nfs_threads.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# Copyright (c) 2020 by Delphix. All rights reserved. +# Copyright (c) 2020-2021 by Delphix. All rights reserved. 
# # SPDX-License-Identifier: GPL-2.0-or-later # @@ -9,22 +9,24 @@ Display NFS thread usage info along with NFS I/O context. Output Sample: - - packets sockets threads threads metadata read read write write - arrived enqueued woken used calls iops thruput iops thruput - 4589 0 4589 25 16 273 3.6MB 212 2.6MB - 4735 0 4735 8 1 287 3.8MB 212 2.7MB - 4693 0 4693 10 0 280 3.7MB 216 2.7MB - 4625 0 4625 15 0 278 3.7MB 212 2.6MB - 4687 0 4687 7 1 285 3.8MB 210 2.6MB - 4701 0 4701 12 0 285 3.8MB 215 2.7MB +packets sockets threads threads metadata read read write write +arrived enqueued woken used calls iops thruput iops thruput + 78683 538 78145 57 209 3390 142.5MB 9014 107.0MB + 106114 4527 101587 63 50 4211 166.8MB 13294 133.0MB + 110220 1511 108709 61 10 4347 10.7MB 13767 137.5MB + 80630 4741 75889 62 50 4218 179.4MB 8743 107.9MB + 115463 11400 104063 62 21 4231 179.4MB 15404 150.5MB ''' +import os import psutil from signal import signal, SIGINT import sys from time import sleep +import datetime +import argparse +PROCFS_NFSD = "/proc/fs/nfsd" POOL_STATS = "/proc/fs/nfsd/pool_stats" NFSD_STATS = "/proc/net/rpc/nfsd" @@ -33,18 +35,26 @@ H2 = ['arrived', 'enqueued', 'woken', 'used', 'calls', 'iops', 'thruput', 'iops', 'thruput'] -INTERVAL = 5 + +def parse_cmdline(): + parser = argparse.ArgumentParser( + description='Display nfsd thread usage info along with NFS I/O ' + 'context') + parser.add_argument( + '--interval', type=int, choices=range(1, 31), + default=5, help='sampling interval in seconds (defaults to 5)') + return parser.parse_args() def server_stopped(message=''): - print("NFS Server Stopped {}".format(message)) - sys.exit() + print("*NFS Server Stopped {}".format(message)) def print_header(header): + print(' '*19, end='') for col in header: print('{0:>10}'.format(col), end='') - print() + print(flush=True) def pool_stats(): @@ -56,10 +66,10 @@ def pool_stats(): packets = int(fields[1]) enqueued = int(fields[2]) woken = int(fields[3]) - timedout = int(fields[4]) - return packets, enqueued, woken, timedout - except OSError: + return packets, enqueued, woken, None + except OSError as e: server_stopped() + return 0, 0, 0, e def nfs_stats(): @@ -93,21 +103,22 @@ def nfs_stats(): metadata += int(fields[11]) metadata += int(fields[17]) metadata += int(fields[36]) - return readbytes, writebytes, readops, writeops, metadata - except OSError: + return readbytes, writebytes, readops, writeops, metadata, None + except OSError as e: server_stopped() + return 0, 0, 0, 0, 0, e -def context_switches(pids): - "Return a list of context switches per process in pids" +def cpu_time(pids): + "Return a list of time spent on cpu per process in pids" ls = [] for pid in pids: try: - pctxsw = psutil.Process(pid).num_ctx_switches() - ls.append(pctxsw.voluntary + pctxsw.involuntary) - except psutil.NoSuchProcess: + ls.append(psutil.Process(pid).cpu_times().system) + except psutil.NoSuchProcess as e: server_stopped() - return ls + return None, e + return ls, None def nfsd_processes(): @@ -132,35 +143,44 @@ def print_thruput(value): print('{0:>8}KB'.format(int(value / 1024)), end='') -def print_line(): +def print_line(interval): + lines = 0 pids = nfsd_processes() - prevSwitches = context_switches(pids) - prevPackets, prevEnqueued, prevWoken, prevTimedout = pool_stats() - prevRB, prevWB, prevRO, prevWO, prevMeta = nfs_stats() + prevCpuTime, e1 = cpu_time(pids) + prevPackets, prevEnqueued, prevWoken, e2 = pool_stats() + prevRB, prevWB, prevRO, prevWO, prevMeta, e3 = nfs_stats() + if e1 or e2 or e3: + return - while(not 
sleep(INTERVAL)): - nextSwitches = context_switches(pids) - nextPackets, nextEnqueued, nextWoken, nextTimedout = pool_stats() - nextRB, nextWB, nextRO, nextWO, nextMeta = nfs_stats() + while(not sleep(interval)): + nextCpuTime, e1 = cpu_time(pids) + nextPackets, nextEnqueued, nextWoken, e2 = pool_stats() + nextRB, nextWB, nextRO, nextWO, nextMeta, e3 = nfs_stats() + if e1 or e2 or e3: + return + # + # Count threads that used cpu time in this interval + # threads = 0 - for i in range(0, len(prevSwitches)): - if not prevSwitches[i] == nextSwitches[i]: + for i in range(0, len(prevCpuTime)): + if not prevCpuTime[i] == nextCpuTime[i]: threads += 1 - threads -= nextTimedout - prevTimedout - prevSwitches = nextSwitches.copy() + prevCpuTime = nextCpuTime.copy() # # The published 'sockets-enqueued' value needs adjustment # enqueued = (nextEnqueued - prevEnqueued) - (nextWoken - prevWoken) + if enqueued < 0: + enqueued = 0 # # For IOPS values less than 10 display with decimal # - readOps = (nextRO - prevRO) / INTERVAL - writeOps = (nextWO - prevWO) / INTERVAL + readOps = (nextRO - prevRO) / interval + writeOps = (nextWO - prevWO) / interval readOps = int(readOps) if readOps > 9 else round(readOps, 1) writeOps = int(writeOps) if writeOps > 9 else round(writeOps, 1) @@ -173,35 +193,43 @@ def print_line(): if nextWB < prevWB: prevWB = 0 + if lines % 48 == 0: + print_header(H1) + print_header(H2) + + print('{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()), end='') print_value(nextPackets - prevPackets) print_value(enqueued) print_value(nextWoken - prevWoken) print_value(threads) print_value(nextMeta - prevMeta) print_value(readOps) - print_thruput((nextRB - prevRB) / INTERVAL) + print_thruput((nextRB - prevRB) / interval) print_value(writeOps) - print_thruput((nextWB - prevWB) / INTERVAL) - print() + print_thruput((nextWB - prevWB) / interval) + print(flush=True) prevPackets = nextPackets prevEnqueued = nextEnqueued prevWoken = nextWoken - prevTimedout = nextTimedout prevMeta = nextMeta prevRB = nextRB prevWB = nextWB prevRO = nextRO prevWO = nextWO + lines += 1 def handler(signal_received, frame): - print() + print(flush=True) sys.exit(0) signal(SIGINT, handler) -print_header(H1) -print_header(H2) -print_line() +arguments = parse_cmdline() + +while True: + if os.path.exists(PROCFS_NFSD): + print_line(arguments.interval) + sleep(2) From ac43e686f698d2d508beda24d79cc874cece7293 Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Fri, 11 Jun 2021 09:28:59 -0600 Subject: [PATCH 24/43] DLPX-75470 estat zpl and arc_prefetch scripts need znode parameter (#61) (#64) --- bpf/estat/zpl.c | 14 +++++++------- bpf/standalone/arc_prefetch.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bpf/estat/zpl.c b/bpf/estat/zpl.c index 22eac4a..8ea5d0b 100644 --- a/bpf/estat/zpl.c +++ b/bpf/estat/zpl.c @@ -46,11 +46,11 @@ equal_to_pool(char *str) } static inline int -zfs_read_write_entry(io_info_t *info, struct inode *ip, uio_t *uio, int flags) +zfs_read_write_entry(io_info_t *info, struct znode *zn, zfs_uio_t *uio, int flags) { // Essentially ITOZSB, but written explicitly so that BCC can insert // the necessary calls to bpf_probe_read. 
- zfsvfs_t *zfsvfs = ip->i_sb->s_fs_info; + zfsvfs_t *zfsvfs = zn->z_inode.i_sb->s_fs_info; objset_t *z_os = zfsvfs->z_os; spa_t *spa = z_os->os_spa; @@ -72,26 +72,26 @@ zfs_read_write_entry(io_info_t *info, struct inode *ip, uio_t *uio, int flags) // @@ kprobe|zfs_read|zfs_read_entry int -zfs_read_entry(struct pt_regs *ctx, struct inode *ip, uio_t *uio, int flags) +zfs_read_entry(struct pt_regs *ctx, struct znode *zn, zfs_uio_t *uio, int flags) { io_info_t info = {}; info.is_write = false; - return (zfs_read_write_entry(&info, ip, uio, flags)); + return (zfs_read_write_entry(&info, zn, uio, flags)); } // @@ kprobe|zfs_write|zfs_write_entry int -zfs_write_entry(struct pt_regs *ctx, struct inode *ip, uio_t *uio, int flags) +zfs_write_entry(struct pt_regs *ctx, struct znode *zn, zfs_uio_t *uio, int flags) { io_info_t info = {}; info.is_write = true; - return (zfs_read_write_entry(&info, ip, uio, flags)); + return (zfs_read_write_entry(&info, zn, uio, flags)); } // @@ kretprobe|zfs_read|zfs_read_write_exit // @@ kretprobe|zfs_write|zfs_read_write_exit int -zfs_read_write_exit(struct pt_regs *ctx, struct inode *ip, uio_t *uio) +zfs_read_write_exit(struct pt_regs *ctx, struct znode *zn, zfs_uio_t *uio) { u32 tid = bpf_get_current_pid_tgid(); io_info_t *info = io_info_map.lookup(&tid); diff --git a/bpf/standalone/arc_prefetch.py b/bpf/standalone/arc_prefetch.py index 3c82019..c7ef659 100755 --- a/bpf/standalone/arc_prefetch.py +++ b/bpf/standalone/arc_prefetch.py @@ -89,14 +89,14 @@ BPF_HASH(read_average, lat_key, average_t); BPF_PERCPU_ARRAY(arc_count, u32, NCOUNT_INDEX); -int zfs_read_entry(struct pt_regs *ctx, struct inode *ip) +int zfs_read_entry(struct pt_regs *ctx, struct znode *zn) { u32 tid = bpf_get_current_pid_tgid(); u64 ts = bpf_ktime_get_ns(); arc_prefetch_info_t info = {ts}; // filter by pool - zfsvfs_t *zfsvfs = ip->i_sb->s_fs_info; + zfsvfs_t *zfsvfs = zn->z_inode.i_sb->s_fs_info; objset_t *z_os = zfsvfs->z_os; spa_t *spa = z_os->os_spa; if (POOL_COMPARE(spa)) From 3f98846d4ef536dabb98a63842807c142f9ded6e Mon Sep 17 00:00:00 2001 From: Pavel Zakharov Date: Tue, 3 Aug 2021 11:38:42 -0400 Subject: [PATCH 25/43] TOOL-11951 [Backport of TOOL-11731] performance-diagnostics: build-depends on python3-minimal instead of python-minimal (#67) --- debian/control | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/control b/debian/control index f50a22d..b9d7a06 100644 --- a/debian/control +++ b/debian/control @@ -13,6 +13,6 @@ Standards-Version: 4.1.2 Package: performance-diagnostics Architecture: any -Depends: python3-bcc, python-minimal, python3-psutil +Depends: python3-bcc, python3-minimal, python3-psutil Description: eBPF-based Performance Diagnostic Tools A collection of eBPF-based tools for diagnosing performance issues. 
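Several of the following patches (26, 29, 30) react to block-layer kprobes moving under newer kernels: blk_start_request disappears on blk-mq-only 5.x kernels, and completion accounting is traced via blk_account_io_done instead of blk_account_io_completion. A minimal bcc sketch of the guarded-attach pattern the io scripts converge on (the stub probes here are placeholders standing in for the real disk_io_start/disk_io_done programs, and the else-fallback is only an illustration; the patches themselves switch to blk_account_io_done unconditionally):

from bcc import BPF

# Stub probe functions for illustration; the real scripts define these in bpf_text.
bpf_text = """
int disk_io_start(struct pt_regs *ctx) { return 0; }
int disk_io_done(struct pt_regs *ctx) { return 0; }
"""
b = BPF(text=bpf_text)

# Attach the legacy entry point only if the running kernel still exports it.
if BPF.get_kprobe_functions(b'blk_start_request'):
    b.attach_kprobe(event="blk_start_request", fn_name="disk_io_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="disk_io_start")

# Completion side: prefer blk_account_io_done where present.
if BPF.get_kprobe_functions(b'blk_account_io_done'):
    b.attach_kprobe(event="blk_account_io_done", fn_name="disk_io_done")
else:
    b.attach_kprobe(event="blk_account_io_completion", fn_name="disk_io_done")

Probing only the kprobes that actually exist keeps a single script loadable on both pre- and post-5.x kernels instead of failing at attach time.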
From 5cb1777e4b422e3856f465a107eeee3227ee920c Mon Sep 17 00:00:00 2001 From: mr-t-73 <66187826+mr-t-73@users.noreply.github.com> Date: Thu, 9 Dec 2021 14:42:11 +1100 Subject: [PATCH 26/43] estat backend-io script complains about missing blk_start_request kprobe closes #45 (#68) Co-authored-by: Brad Lewis --- bpf/estat/backend-io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/bpf/estat/backend-io.c b/bpf/estat/backend-io.c index 3ddf679..6943d86 100644 --- a/bpf/estat/backend-io.c +++ b/bpf/estat/backend-io.c @@ -29,7 +29,6 @@ typedef struct { BPF_HASH(io_base_data, u64, io_data_t); -// @@ kprobe|blk_start_request|disk_io_start // @@ kprobe|blk_mq_start_request|disk_io_start int disk_io_start(struct pt_regs *ctx, struct request *reqp) From fa44f673c8f3fd1932d77e8655a0fa0f0700d3b4 Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Thu, 9 Dec 2021 08:57:06 -0700 Subject: [PATCH 27/43] DLPX-72683 [Backport of DLPX-72683 to 6.0.12.0] estat utility's help text is missing documentation for arc_prefetch (#70) --- cmd/estat.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmd/estat.py b/cmd/estat.py index c0d9e16..8d64b23 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -95,6 +95,11 @@ def die(*args, **kwargs): The subcommands listed in the following section are stand alone tracers. Each has its own options as detailed below. + estat arc_prefetch [options] + Collect arc_prefetch statistics for 5 second intervals. + -h show arc_prefetch help message and exit + -p POOL set the pool to monitor (default: domain0) + estat txg [options] Collect spa_sync statistics for each txg. -h show txg help message and exit From 3d0e3f2ec20bb09003f6d30b7b4b8efebff26a36 Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Thu, 9 Dec 2021 09:05:41 -0700 Subject: [PATCH 28/43] DLPX-77845 [Backport of DLPX-77532 to 6.0.12.0] The iscsi estat script needs to be fixed so it reports data correctly (#71) --- bpf/estat/iscsi.c | 67 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/bpf/estat/iscsi.c b/bpf/estat/iscsi.c index b406975..e849b91 100644 --- a/bpf/estat/iscsi.c +++ b/bpf/estat/iscsi.c @@ -4,6 +4,24 @@ * SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This tracer provides latency and throughput data for the iscsi target + * reads and writes. The latency is measured from entering + * iscsit_process_scsi_cmd() to exiting iscsit_response_queue(). The + * thread that enters iscsit_process_scsi_cmd() will put an entry on the + * request task queue. This entry will be removed from the queue and + * processed by another thread which calls iscsi_target_response. + * The tracing is performed by three probe functions. + * 1. iscsi_target_start - This function saves a timestamp of the entry + * into iscsit_process_scsi_cmd() hashed by a pointer to the iscsi_cmd. + * 2. iscsi_target_response - This function serves the purpose of moving + * the timestamp saved by iscsi_target_start to a thread id based hash. + * Also the size and direction are stored in the hash since kretprobes + * do not have access to parameters. + * 3. iscsi_target_end - This function retrieves the hashed base data by + * thread id and performs the data aggregation.
+ */ + #include #include #include @@ -20,20 +38,20 @@ typedef struct { u64 ts; u64 size; + u32 direction; } iscsi_data_t; -BPF_HASH(iscsi_base_data, u64, iscsi_data_t); +BPF_HASH(iscsi_start_ts, u64, u64); +BPF_HASH(iscsi_base_data, u32, iscsi_data_t); // @@ kprobe|iscsit_process_scsi_cmd|iscsi_target_start int iscsi_target_start(struct pt_regs *ctx, struct iscsi_conn *conn, struct iscsi_cmd *cmd, struct iscsi_scsi_req *hdr) { - iscsi_data_t data = {}; - data.ts = bpf_ktime_get_ns(); - data.size = hdr->data_length; - iscsi_base_data.update((u64 *) &cmd, &data); + u64 ts = bpf_ktime_get_ns(); + iscsi_start_ts.update((u64 *) &cmd, &ts); return (0); } @@ -53,25 +71,48 @@ aggregate_data(iscsi_data_t *data, u64 ts, char *opstr) return (0); } -// @@ kprobe|iscsit_build_rsp_pdu|iscsi_target_end -// @@ kprobe|iscsit_build_datain_pdu|iscsi_target_end + + +// @@ kprobe|iscsit_response_queue|iscsi_target_response int -iscsi_target_end(struct pt_regs *ctx, struct iscsi_cmd *cmd) +iscsi_target_response(struct pt_regs *ctx, struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ + u32 tid = bpf_get_current_pid_tgid(); + iscsi_data_t data = {}; + + u64 *tsp = iscsi_start_ts.lookup((u64 *) &cmd); + if (tsp == 0) { + return (0); // missed issue + } + + data.ts = *tsp; + data.size = cmd->se_cmd.data_length; + data.direction = cmd->data_direction; + + iscsi_base_data.update(&tid, &data); + iscsi_start_ts.delete((u64 *) &cmd); + + return (0); +} + +// @@ kretprobe|iscsit_response_queue|iscsi_target_end +int +iscsi_target_end(struct pt_regs *ctx) { u64 ts = bpf_ktime_get_ns(); - iscsi_data_t *data = iscsi_base_data.lookup((u64 *) &cmd); - u64 delta; + u32 tid = bpf_get_current_pid_tgid(); + iscsi_data_t *data = iscsi_base_data.lookup(&tid); if (data == 0) { return (0); // missed issue } - if (cmd->data_direction == DMA_FROM_DEVICE) { + if (data->direction == DMA_FROM_DEVICE) { aggregate_data(data, ts, READ_STR); - } else if (cmd->data_direction & DMA_TO_DEVICE) { + } else if (data->direction == DMA_TO_DEVICE) { aggregate_data(data, ts, WRITE_STR); } - iscsi_base_data.delete((u64 *) &cmd); + iscsi_base_data.delete(&tid); return (0); } From 729786d4865628b12b4f882f69b7921203ef8fb0 Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Tue, 21 Dec 2021 13:41:38 -0700 Subject: [PATCH 29/43] DLPX-78812 Disk IO analytics collector not running on aws (#73) --- bpf/estat/backend-io.c | 2 +- bpf/stbtrace/io.st | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bpf/estat/backend-io.c b/bpf/estat/backend-io.c index 6943d86..12f6797 100644 --- a/bpf/estat/backend-io.c +++ b/bpf/estat/backend-io.c @@ -43,7 +43,7 @@ disk_io_start(struct pt_regs *ctx, struct request *reqp) return (0); } -// @@ kprobe|blk_account_io_completion|disk_io_done +// @@ kprobe|blk_account_io_done|disk_io_done int disk_io_done(struct pt_regs *ctx, struct request *reqp) { diff --git a/bpf/stbtrace/io.st b/bpf/stbtrace/io.st index 266957a..27549d0 100755 --- a/bpf/stbtrace/io.st +++ b/bpf/stbtrace/io.st @@ -128,7 +128,7 @@ b = BPF(text=bpf_text) if BPF.get_kprobe_functions(b'blk_start_request'): b.attach_kprobe(event="blk_start_request", fn_name="disk_io_start") b.attach_kprobe(event="blk_mq_start_request", fn_name="disk_io_start") -b.attach_kprobe(event="blk_account_io_completion", fn_name="disk_io_done") +b.attach_kprobe(event="blk_account_io_done", fn_name="disk_io_done") helper = BCCHelper(b, BCCHelper.ANALYTICS_PRINT_MODE) From 2bb739db89d3c63b3e385adadb46056fd3ebf55f Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Wed, 22 Dec 
2021 09:33:35 -0700 Subject: [PATCH 30/43] DLPX-78888 [Backport of DLPX-78812 to 6.0.12] Disk IO analytics collector not running on aws (#75) --- bpf/estat/backend-io.c | 2 +- bpf/stbtrace/io.st | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bpf/estat/backend-io.c b/bpf/estat/backend-io.c index 3ddf679..e2a5488 100644 --- a/bpf/estat/backend-io.c +++ b/bpf/estat/backend-io.c @@ -44,7 +44,7 @@ disk_io_start(struct pt_regs *ctx, struct request *reqp) return (0); } -// @@ kprobe|blk_account_io_completion|disk_io_done +// @@ kprobe|blk_account_io_done|disk_io_done int disk_io_done(struct pt_regs *ctx, struct request *reqp) { diff --git a/bpf/stbtrace/io.st b/bpf/stbtrace/io.st index 266957a..27549d0 100755 --- a/bpf/stbtrace/io.st +++ b/bpf/stbtrace/io.st @@ -128,7 +128,7 @@ b = BPF(text=bpf_text) if BPF.get_kprobe_functions(b'blk_start_request'): b.attach_kprobe(event="blk_start_request", fn_name="disk_io_start") b.attach_kprobe(event="blk_mq_start_request", fn_name="disk_io_start") -b.attach_kprobe(event="blk_account_io_completion", fn_name="disk_io_done") +b.attach_kprobe(event="blk_account_io_done", fn_name="disk_io_done") helper = BCCHelper(b, BCCHelper.ANALYTICS_PRINT_MODE) From 7a1656aa8bef454563b1e5258f91f1b0a3371a6d Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Fri, 21 Jan 2022 08:40:48 -0700 Subject: [PATCH 31/43] DLPX-78745 Apply estat iscsi approach to stbtrace script (#72) --- bpf/stbtrace/iscsi.st | 75 ++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/bpf/stbtrace/iscsi.st b/bpf/stbtrace/iscsi.st index bb17237..d4a758f 100755 --- a/bpf/stbtrace/iscsi.st +++ b/bpf/stbtrace/iscsi.st @@ -38,8 +38,8 @@ bpf_text += """ #define OP_NAME_LEN 6 typedef struct { u64 ts; - u64 flags; u64 size; + u32 direction; } iscsi_data_t; // Key structure for scalar aggregations maps @@ -52,7 +52,8 @@ typedef struct { HIST_KEY(iscsi_hist_key_t, iscsi_key_t); -BPF_HASH(iscsi_base_data, u64, iscsi_data_t); +BPF_HASH(iscsi_start_ts, u64, u64); +BPF_HASH(iscsi_base_data, u32, iscsi_data_t); $maps:{map| BPF_HASH($map.name$, iscsi_key_t, $map.type$); }$ @@ -64,13 +65,31 @@ BPF_HASH($hist.name$, iscsi_hist_key_t, u64); int iscsi_target_start(struct pt_regs *ctx, struct iscsi_conn *conn, struct iscsi_cmd *cmd, struct iscsi_scsi_req *hdr) { - iscsi_data_t data = {}; - data.ts = bpf_ktime_get_ns(); - data.flags = hdr->flags; - data.size = hdr->data_length; - iscsi_base_data.update((u64 *) &cmd, &data); + u64 ts = bpf_ktime_get_ns(); + iscsi_start_ts.update((u64 *) &cmd, &ts); - return 0; + return (0); +} + +int iscsi_target_response(struct pt_regs *ctx, struct iscsi_conn *conn, + struct iscsi_cmd *cmd, int state) +{ + u32 tid = bpf_get_current_pid_tgid(); + iscsi_data_t data = {}; + + u64 *tsp = iscsi_start_ts.lookup((u64 *) &cmd); + if (tsp == 0) { + return (0); // missed issue + } + + data.ts = *tsp; + data.size = cmd->se_cmd.data_length; + data.direction = cmd->data_direction; + + iscsi_base_data.update(&tid, &data); + iscsi_start_ts.delete((u64 *) &cmd); + + return (0); } static int aggregate_data(iscsi_data_t *data, u64 ts, char *opstr) @@ -99,33 +118,31 @@ static int aggregate_data(iscsi_data_t *data, u64 ts, char *opstr) return 0; } -int iscsi_target_end(struct pt_regs *ctx, struct iscsi_cmd *cmd) +int iscsi_target_end(struct pt_regs *ctx) { - u64 ts = bpf_ktime_get_ns(); - iscsi_data_t *data = iscsi_base_data.lookup((u64 *) &cmd); - u64 delta; - iscsi_key_t key = {}; - char *opstr; - - if (data == 0) { - return 0; // missed
issue - } - - if (data->flags & ISCSI_FLAG_CMD_READ) { - aggregate_data(data, ts, READ_STR); - } else if (data->flags & ISCSI_FLAG_CMD_WRITE) { - aggregate_data(data, ts, WRITE_STR); - } - iscsi_base_data.delete((u64 *) &cmd); - - return 0; + u64 ts = bpf_ktime_get_ns(); + u32 tid = bpf_get_current_pid_tgid(); + iscsi_data_t *data = iscsi_base_data.lookup(&tid); + + if (data == 0) { + return (0); // missed issue + } + + if (data->direction == DMA_FROM_DEVICE) { + aggregate_data(data, ts, READ_STR); + } else if (data->direction == DMA_TO_DEVICE) { + aggregate_data(data, ts, WRITE_STR); + } + iscsi_base_data.delete(&tid); + + return (0); } - """ # noqa: W293 b = BPF(text=bpf_text) b.attach_kprobe(event="iscsit_process_scsi_cmd", fn_name="iscsi_target_start") -b.attach_kprobe(event="iscsit_build_rsp_pdu", fn_name="iscsi_target_end") +b.attach_kprobe(event="iscsit_response_queue", fn_name="iscsi_target_response") +b.attach_kretprobe(event="iscsit_response_queue", fn_name="iscsi_target_end") helper = BCCHelper(b, BCCHelper.ANALYTICS_PRINT_MODE) $maps:{map| From d51210a5ca6bf8f1e8fc923353de26d493757bd5 Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Fri, 21 Jan 2022 11:46:26 -0700 Subject: [PATCH 32/43] DLPX-78891 stbtrace zpl fails due to uid_t (#76) --- bpf/stbtrace/zpl.st | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bpf/stbtrace/zpl.st b/bpf/stbtrace/zpl.st index c10e918..1b219d2 100644 --- a/bpf/stbtrace/zpl.st +++ b/bpf/stbtrace/zpl.st @@ -28,7 +28,8 @@ bpf_text += """ #include #include -#include +#include +#include // Definitions for this script #define READ_STR "read" @@ -67,9 +68,9 @@ BPF_HASH($hist.name$, zpl_hist_key_t, u64); }$ // Probe functions to initialize thread local data -int zfs_read_start(struct pt_regs *ctx, void *inode, uio_t *uio) +int zfs_read_start(struct pt_regs *ctx, struct znode *zn, zfs_uio_t *uio, + int flags) { - u32 pid = bpf_get_current_pid_tgid(); zpl_data_t data = {}; data.ts = bpf_ktime_get_ns(); @@ -81,9 +82,9 @@ int zfs_read_start(struct pt_regs *ctx, void *inode, uio_t *uio) } // Probe functions to initialize thread local data -int zfs_write_start(struct pt_regs *ctx, void *inode, uio_t *uio) +int zfs_write_start(struct pt_regs *ctx, struct znode *zn, zfs_uio_t *uio, + int flags) { - u32 pid = bpf_get_current_pid_tgid(); zpl_data_t data = {}; data.ts = bpf_ktime_get_ns(); From ded967203dcceb286546d0f5293b0bf9e5afa594 Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Fri, 21 Jan 2022 13:50:33 -0700 Subject: [PATCH 33/43] DLPX-79245 - [Backport of DLPX-78745 to 6.0.13.0] Apply estat iscsi approach to stbtrace script (#77) --- bpf/stbtrace/iscsi.st | 75 ++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/bpf/stbtrace/iscsi.st b/bpf/stbtrace/iscsi.st index bb17237..d4a758f 100755 --- a/bpf/stbtrace/iscsi.st +++ b/bpf/stbtrace/iscsi.st @@ -38,8 +38,8 @@ bpf_text += """ #define OP_NAME_LEN 6 typedef struct { u64 ts; - u64 flags; u64 size; + u32 direction; } iscsi_data_t; // Key structure for scalar aggegations maps @@ -52,7 +52,8 @@ typedef struct { HIST_KEY(iscsi_hist_key_t, iscsi_key_t); -BPF_HASH(iscsi_base_data, u64, iscsi_data_t); +BPF_HASH(iscsi_start_ts, u64, u64); +BPF_HASH(iscsi_base_data, u32, iscsi_data_t); $maps:{map| BPF_HASH($map.name$, iscsi_key_t, $map.type$); }$ @@ -64,13 +65,31 @@ BPF_HASH($hist.name$, iscsi_hist_key_t, u64); int iscsi_target_start(struct pt_regs *ctx, struct iscsi_conn *conn, struct iscsi_cmd *cmd, struct iscsi_scsi_req *hdr) { - iscsi_data_t 
data = {}; - data.ts = bpf_ktime_get_ns(); - data.flags = hdr->flags; - data.size = hdr->data_length; - iscsi_base_data.update((u64 *) &cmd, &data); + u64 ts = bpf_ktime_get_ns(); + iscsi_start_ts.update((u64 *) &cmd, &ts); - return 0; + return (0); +} + +int iscsi_target_response(struct pt_regs *ctx, struct iscsi_conn *conn, + struct iscsi_cmd *cmd, int state) +{ + u32 tid = bpf_get_current_pid_tgid(); + iscsi_data_t data = {}; + + u64 *tsp = iscsi_start_ts.lookup((u64 *) &cmd); + if (tsp == 0) { + return (0); // missed issue + } + + data.ts = *tsp; + data.size = cmd->se_cmd.data_length; + data.direction = cmd->data_direction; + + iscsi_base_data.update(&tid, &data); + iscsi_start_ts.delete((u64 *) &cmd); + + return (0); } static int aggregate_data(iscsi_data_t *data, u64 ts, char *opstr) @@ -99,33 +118,31 @@ static int aggregate_data(iscsi_data_t *data, u64 ts, char *opstr) return 0; } -int iscsi_target_end(struct pt_regs *ctx, struct iscsi_cmd *cmd) +int iscsi_target_end(struct pt_regs *ctx) { - u64 ts = bpf_ktime_get_ns(); - iscsi_data_t *data = iscsi_base_data.lookup((u64 *) &cmd); - u64 delta; - iscsi_key_t key = {}; - char *opstr; - - if (data == 0) { - return 0; // missed issue - } - - if (data->flags & ISCSI_FLAG_CMD_READ) { - aggregate_data(data, ts, READ_STR); - } else if (data->flags & ISCSI_FLAG_CMD_WRITE) { - aggregate_data(data, ts, WRITE_STR); - } - iscsi_base_data.delete((u64 *) &cmd); - - return 0; + u64 ts = bpf_ktime_get_ns(); + u32 tid = bpf_get_current_pid_tgid(); + iscsi_data_t *data = iscsi_base_data.lookup(&tid); + + if (data == 0) { + return (0); // missed issue + } + + if (data->direction == DMA_FROM_DEVICE) { + aggregate_data(data, ts, READ_STR); + } else if (data->direction == DMA_TO_DEVICE) { + aggregate_data(data, ts, WRITE_STR); + } + iscsi_base_data.delete(&tid); + + return (0); } - """ # noqa: W293 b = BPF(text=bpf_text) b.attach_kprobe(event="iscsit_process_scsi_cmd", fn_name="iscsi_target_start") -b.attach_kprobe(event="iscsit_build_rsp_pdu", fn_name="iscsi_target_end") +b.attach_kprobe(event="iscsit_response_queue", fn_name="iscsi_target_response") +b.attach_kretprobe(event="iscsit_response_queue", fn_name="iscsi_target_end") helper = BCCHelper(b, BCCHelper.ANALYTICS_PRINT_MODE) $maps:{map| From fe681cb8e63ebb89762454a76e3212ee13e33210 Mon Sep 17 00:00:00 2001 From: Brad Lewis Date: Thu, 3 Feb 2022 13:15:12 -0700 Subject: [PATCH 34/43] DLPX-79246 - [Backport of DLPX-78891 to 6.0.13.0] stbtrace zpl fails due to uid_t (#78) --- bpf/stbtrace/zpl.st | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bpf/stbtrace/zpl.st b/bpf/stbtrace/zpl.st index c10e918..1b219d2 100644 --- a/bpf/stbtrace/zpl.st +++ b/bpf/stbtrace/zpl.st @@ -28,7 +28,8 @@ bpf_text += """ #include #include -#include +#include +#include // Definitions for this script #define READ_STR "read" @@ -67,9 +68,9 @@ BPF_HASH($hist.name$, zpl_hist_key_t, u64); }$ // Probe functions to initialize thread local data -int zfs_read_start(struct pt_regs *ctx, void *inode, uio_t *uio) +int zfs_read_start(struct pt_regs *ctx, struct znode *zn, zfs_uio_t *uio, + int flags) { - u32 pid = bpf_get_current_pid_tgid(); zpl_data_t data = {}; data.ts = bpf_ktime_get_ns(); @@ -81,9 +82,9 @@ int zfs_read_start(struct pt_regs *ctx, void *inode, uio_t *uio) } // Probe functions to initialize thread local data -int zfs_write_start(struct pt_regs *ctx, void *inode, uio_t *uio) +int zfs_write_start(struct pt_regs *ctx, struct znode *zn, zfs_uio_t *uio, + int flags) { - u32 pid = 
bpf_get_current_pid_tgid(); zpl_data_t data = {}; data.ts = bpf_ktime_get_ns(); From e20d662b66f645ebb62f82c7a26cacba6fd00209 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Wed, 13 Apr 2022 18:15:09 -0400 Subject: [PATCH 35/43] TOOL-13470 add telegraf dependency to the performance-diagnostic package (#80) --- debian/control | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/control b/debian/control index b9d7a06..fb11bac 100644 --- a/debian/control +++ b/debian/control @@ -13,6 +13,6 @@ Standards-Version: 4.1.2 Package: performance-diagnostics Architecture: any -Depends: python3-bcc, python3-minimal, python3-psutil +Depends: python3-bcc, python3-minimal, python3-psutil, telegraf Description: eBPF-based Performance Diagnostic Tools A collection of eBPF-based tools for diagnosing performance issues. From 8cd3f7a28d08dac751a3abd159acabfa8432ea28 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 15 Apr 2022 18:25:16 -0400 Subject: [PATCH 36/43] TOOL-13515 [Backport of TOOL-13470 to 6.0.14.0] add telegraf dependency to the performance-diagnostic package (#82) --- debian/control | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/control b/debian/control index b9d7a06..fb11bac 100644 --- a/debian/control +++ b/debian/control @@ -13,6 +13,6 @@ Standards-Version: 4.1.2 Package: performance-diagnostics Architecture: any -Depends: python3-bcc, python3-minimal, python3-psutil +Depends: python3-bcc, python3-minimal, python3-psutil, telegraf Description: eBPF-based Performance Diagnostic Tools A collection of eBPF-based tools for diagnosing performance issues. From 9a607809e5182a9a03f4909b45be6e030556c4b1 Mon Sep 17 00:00:00 2001 From: Sebastien Roy Date: Sat, 16 Apr 2022 15:20:11 -0400 Subject: [PATCH 37/43] Revert "TOOL-13515 [Backport of TOOL-13470 to 6.0.14.0] add telegraf dependency to the performance-diagnostic package (#82)" (#84) This reverts commit 8cd3f7a28d08dac751a3abd159acabfa8432ea28. --- debian/control | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/control b/debian/control index fb11bac..b9d7a06 100644 --- a/debian/control +++ b/debian/control @@ -13,6 +13,6 @@ Standards-Version: 4.1.2 Package: performance-diagnostics Architecture: any -Depends: python3-bcc, python3-minimal, python3-psutil, telegraf +Depends: python3-bcc, python3-minimal, python3-psutil Description: eBPF-based Performance Diagnostic Tools A collection of eBPF-based tools for diagnosing performance issues. From 6a20764fca0a538afe2633d20be7a817c211fd60 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Wed, 20 Apr 2022 16:55:16 -0400 Subject: [PATCH 38/43] TOOL-13515 [Backport of TOOL-13470 to 6.0.15.0] add telegraf dependency to the performance-diagnostic package (#85) --- debian/control | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/control b/debian/control index b9d7a06..fb11bac 100644 --- a/debian/control +++ b/debian/control @@ -13,6 +13,6 @@ Standards-Version: 4.1.2 Package: performance-diagnostics Architecture: any -Depends: python3-bcc, python3-minimal, python3-psutil +Depends: python3-bcc, python3-minimal, python3-psutil, telegraf Description: eBPF-based Performance Diagnostic Tools A collection of eBPF-based tools for diagnosing performance issues. 
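A note on the probe pattern the iscsi rework above (and its backports) relies on: a kretprobe cannot dereference the probed function's arguments, so the entry kprobe stashes per-request state in a BPF_HASH and the return probe retrieves it by thread id. Below is a minimal standalone sketch of that entry/return pairing -- not part of this patch series -- using vfs_read as an arbitrary stand-in probe point and trace_pipe output in place of the BCCHelper aggregations:

#!/usr/bin/env python3
#
# Minimal sketch (hypothetical, for illustration only) of the
# kprobe/kretprobe pairing used by the iscsi scripts above: the entry
# probe records a timestamp keyed by thread id, and the return probe
# looks it up to compute latency.
from bcc import BPF

bpf_text = """
#include <uapi/linux/ptrace.h>

BPF_HASH(start_ts, u32, u64);

int entry_probe(struct pt_regs *ctx)
{
        u32 tid = bpf_get_current_pid_tgid();
        u64 ts = bpf_ktime_get_ns();
        start_ts.update(&tid, &ts);
        return (0);
}

int return_probe(struct pt_regs *ctx)
{
        u32 tid = bpf_get_current_pid_tgid();
        u64 *tsp = start_ts.lookup(&tid);
        if (tsp == 0) {
                return (0);     // missed entry
        }
        u64 delta = bpf_ktime_get_ns() - *tsp;
        bpf_trace_printk("latency: %llu ns\\n", delta);
        start_ts.delete(&tid);
        return (0);
}
"""

b = BPF(text=bpf_text)
# Guard against symbols missing on this kernel, as the
# blk_start_request fixes above do.
if BPF.get_kprobe_functions(b'vfs_read'):
    b.attach_kprobe(event="vfs_read", fn_name="entry_probe")
    b.attach_kretprobe(event="vfs_read", fn_name="return_probe")
b.trace_print()

Keying the return side by thread id rather than by the command pointer is what lets the kretprobe work without arguments; it assumes entry and return occur on the same thread, which holds for the synchronous call paths traced here.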
From 18023e7bee12b2aa73968c68a64fde3ab7136285 Mon Sep 17 00:00:00 2001 From: Scott MacDonald Date: Fri, 15 Apr 2022 13:56:15 +0100 Subject: [PATCH 39/43] CP-8403 add telegraf-based metric collection Initial addition of configuration and control files to enable performance metric collection using the Telegraf agent. See also IDEA-2835 : Improving Support Bundle Performance Metrics Includes: - Service definition and startup script for "delphix-telegraf" - Modified version of "estat" adding JSON output via a "-j" option - A "perf_playbook" wrapper script to enable/disable enhanced collection - Configuration file sections (combined on startup) - Simple wrappers to facilitate parsing of "nfs_threads", "zpool iostat -o", and "zcache stats -a" outputs The service starts with a "base" set of metrics, but will include Object Storage metrics when it is detected, and will include Performance Playbook commands if that has been enabled (manually). The config is reassembled each startup. File paths intended: /opt/delphix/server/bin/delphix-telegraf-service /lib/systemd/system/delphix-telegraf.service /etc/telegraf/nfs-threads.sh /opt/delphix/server/bin/perf_playbook /etc/telegraf/telegraf.base /etc/telegraf/telegraf.inputs.dose /etc/telegraf/telegraf.inputs.playbook /etc/telegraf/zcache-stats.sh /etc/telegraf/zpool-iostat-o.sh This configuration records 4 output files (rotated on size) for main metrics, aggregate statistics (min,max,mean,stddev) and Playbook outputs to enable independent retention periods. --- cmd/estat.py | 24 +++-- debian/rules | 3 + telegraf/delphix-telegraf-service | 34 +++++++ telegraf/delphix-telegraf.service | 18 ++++ telegraf/nfs-threads.sh | 3 + telegraf/perf_playbook | 53 +++++++++++ telegraf/telegraf.base | 131 ++++++++++++++++++++++++++ telegraf/telegraf.inputs.dose | 46 +++++++++ telegraf/telegraf.inputs.playbook | 149 ++++++++++++++++++++++++++++++ telegraf/zcache-stats.sh | 3 + telegraf/zpool-iostat-o.sh | 3 + 11 files changed, 459 insertions(+), 8 deletions(-) create mode 100755 telegraf/delphix-telegraf-service create mode 100644 telegraf/delphix-telegraf.service create mode 100755 telegraf/nfs-threads.sh create mode 100755 telegraf/perf_playbook create mode 100644 telegraf/telegraf.base create mode 100644 telegraf/telegraf.inputs.dose create mode 100644 telegraf/telegraf.inputs.playbook create mode 100755 telegraf/zcache-stats.sh create mode 100755 telegraf/zpool-iostat-o.sh diff --git a/cmd/estat.py b/cmd/estat.py index ebbc6ab..47d8f00 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -89,6 +89,7 @@ def die(*args, **kwargs): -q/-Q enable/disable latency histograms by size (default: off) -y/-Y enable/disable the summary output (default: on) -t/-T enable/disable emitting the summary total (default: on) + -j set output mode to JSON -d LEVEL set BCC debug level -e emit the resulting eBPF script without executing it @@ -111,7 +112,6 @@ def die(*args, **kwargs): particular the time spent allocating a block and time spent waiting for the write I/O to complete. If POOL is not specified, defaults to tracing the pool 'domain0'. - """ @@ -149,6 +149,7 @@ def usage(msg): script_arg = None debug_level = 0 dump_bpf = False +output_mode = BCCHelper.ESTAT_PRINT_MODE class Args: @@ -161,6 +162,7 @@ class Args: setattr(args, "latsize_hist", False) setattr(args, "summary", True) setattr(args, "total", True) +setattr(args, "json", False) # # We use getopt rather than argparse because it is very difficult to get @@ -170,7 +172,7 @@ class Args: # arguments. 
# try: - opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLzZqQyYnNtTd:e") + opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLjzZqQyYnNtTd:e") except getopt.GetoptError as err: die(err) @@ -194,6 +196,7 @@ class Args: dump_bpf = True else: switches = {'-l': "lat_hist", + '-j': "json", '-z': "size_hist", '-q': "latsize_hist", '-y': "summary", @@ -219,6 +222,9 @@ class Args: if not (args.lat_hist or args.size_hist or args.latsize_hist): args.lat_hist = True +if args.json: + output_mode = BCCHelper.ANALYTICS_PRINT_MODE + # Now that we are done parsing arguments, construct the text of the BPF program try: with open(base_dir + 'bpf/estat/' + program + '.c', 'r') as prog_file: @@ -443,7 +449,7 @@ class Args: probe_type + "'") if args.lat_hist or args.size_hist or args.summary: - helper1 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) + helper1 = BCCHelper(b, output_mode) helper1.add_key_type("name") helper1.add_key_type("axis") @@ -465,7 +471,7 @@ class Args: "bytes") if args.latsize_hist: - helper2 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) + helper2 = BCCHelper(b, output_mode) helper2.add_aggregation("latsq", BCCHelper.LL_HISTOGRAM_AGGREGATION, "microseconds") helper2.add_key_type("size") @@ -473,15 +479,16 @@ class Args: helper2.add_key_type("axis") if args.summary and args.total: - helper3 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) + helper3 = BCCHelper(b, output_mode) helper3.add_aggregation("opst", BCCHelper.COUNT_AGGREGATION, "iops(/s)") helper3.add_aggregation("datat", BCCHelper.SUM_AGGREGATION, "throughput(k/s)") helper3.add_key_type("name") # Need real time; -print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate this line -print(" Tracing enabled... Hit Ctrl-C to end.") +if not args.json: + print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate line + print(" Tracing enabled... Hit Ctrl-C to end.") # output if monitor: @@ -508,7 +515,8 @@ class Args: helper1.printall(clear_data) if args.summary and args.total: helper3.printall(clear_data) - print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) + if not args.json: + print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) except Exception as e: die(e) else: diff --git a/debian/rules b/debian/rules index d1dbb9a..3f8b76c 100755 --- a/debian/rules +++ b/debian/rules @@ -22,3 +22,6 @@ override_dh_auto_install: dh_install build/cmd/* /usr/bin dh_install lib/* /usr/share/performance-diagnostics/lib dh_install bpf/* /usr/share/performance-diagnostics/bpf + dh_install telegraf/delphix-telegraf-service telegraf/perf_playbook /usr/bin + dh_install telegraf/delphix-telegraf.service /lib/systemd/system + dh_install telegraf/telegraf* telegraf/*.sh /etc/telegraf diff --git a/telegraf/delphix-telegraf-service b/telegraf/delphix-telegraf-service new file mode 100755 index 0000000..9b087e2 --- /dev/null +++ b/telegraf/delphix-telegraf-service @@ -0,0 +1,34 @@ +#!/bin/bash +BASE_CONFIG=/etc/telegraf/telegraf.base +DOSE_INPUTS=/etc/telegraf/telegraf.inputs.dose +PLAYBOOK_INPUTS=/etc/telegraf/telegraf.inputs.playbook +PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED +TELEGRAF_CONFIG=/etc/telegraf/telegraf.conf + + +function engine_is_object_based() { + zdb -C | grep "type: 'object_store'" >/dev/null + [[ "$?" 
== "0" ]] +} + +function playbook_is_enabled() { + [[ -f $PLAYBOOK_FLAG ]] +} + +rm -f $TELEGRAF_CONFIG + +if engine_is_object_based; then + if playbook_is_enabled; then + cat $PLAYBOOK_INPUTS $DOSE_INPUTS $BASE_CONFIG > $TELEGRAF_CONFIG + else + cat $DOSE_INPUTS $BASE_CONFIG > $TELEGRAF_CONFIG + fi +else + if playbook_is_enabled; then + cat $PLAYBOOK_INPUTS $BASE_CONFIG > $TELEGRAF_CONFIG + else + cat $BASE_CONFIG > $TELEGRAF_CONFIG + fi +fi + +/usr/bin/telegraf -config $TELEGRAF_CONFIG diff --git a/telegraf/delphix-telegraf.service b/telegraf/delphix-telegraf.service new file mode 100644 index 0000000..6c767e8 --- /dev/null +++ b/telegraf/delphix-telegraf.service @@ -0,0 +1,18 @@ +[Unit] +Description=Delphix Telegraf Metric Collection Agent +Documentation=https://github.com/influxdata/telegraf +PartOf=delphix.target +After=delphix-platform.service +PartOf=delphix-platform.service + +[Service] +EnvironmentFile=-/etc/default/telegraf +User=root +ExecStart=/usr/bin/delphix-telegraf-service +ExecReload=/bin/kill -HUP $MAINPID +Restart=on-failure +RestartForceExitStatus=SIGPIPE +KillMode=control-group + +[Install] +WantedBy=delphix.target diff --git a/telegraf/nfs-threads.sh b/telegraf/nfs-threads.sh new file mode 100755 index 0000000..cd530cf --- /dev/null +++ b/telegraf/nfs-threads.sh @@ -0,0 +1,3 @@ +#!/bin/sh +nfs_threads | egrep --line-buffered -v "thr" + diff --git a/telegraf/perf_playbook b/telegraf/perf_playbook new file mode 100755 index 0000000..20542d2 --- /dev/null +++ b/telegraf/perf_playbook @@ -0,0 +1,53 @@ +#!/bin/bash +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# +# Script that enables and, disables the Performance Playbook configuration for +# metric collection by Telegraf +# + +PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED + +# +# Make sure this can only be run as root. +# +function die() { + echo -e "$(date +%T:%N:%z): $(basename $0): $*" >&2 + exit 1 +} + +[[ $EUID -ne 0 ]] && die "must be run as root" + +# +# Process command. +# + +function usage() { + echo "$(basename $0): $*" >&2 + echo "Usage: $(basename $0) [enable|disable]" + exit 2 +} + +function enable_playbook() { + date + echo "Enabling Performance Playbook Metrics" + touch $PLAYBOOK_FLAG + systemctl restart delphix-telegraf +} + +function disable_playbook() { + date + echo "Disabling Performance Playbook Metrics" + rm -rf $PLAYBOOK_FLAG + systemctl restart delphix-telegraf +} + +if [[ $# -ne 1 ]]; then + usage +fi + +case "$1" in +enable) enable_playbook ;; +disable) disable_playbook ;; +*) usage ;; +esac diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base new file mode 100644 index 0000000..41d28fb --- /dev/null +++ b/telegraf/telegraf.base @@ -0,0 +1,131 @@ +# Telegraf Configuration +# +# Configuration for telegraf agent +[agent] + interval = "10s" + round_interval = true + flush_interval = "10s" + metric_batch_size = 1000 + metric_buffer_limit = 10000 + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### +# Define the main metric output file, excluding aggregated stats and +# Performance Playbook (estat) data. 
+[[outputs.file]] + files = ["/var/log/telegraf/metrics.json"] + rotation_max_size = "50MB" + rotation_max_archives = 9 + data_format = "json" + namedrop = ["*estat_*", "agg_*", "zfs", "zpool*", "zcache*"] + +# Define output file for ZFS related metrics +[[outputs.file]] + files = ["/var/log/telegraf/metrics_zfs.json"] + rotation_max_size = "30MB" + rotation_max_archives = 5 + data_format = "json" + namepass = ["zpool*", "zcache*", "zfs"] + +# Define output file for Performance Playbook (estat) metrics +[[outputs.file]] + files = ["/var/log/telegraf/metrics_estat.json"] + rotation_max_size = "30MB" + rotation_max_archives = 5 + data_format = "json" + namepass = ["*estat_*"] + +# Define output file for aggregate statistics +[[outputs.file]] + files = ["/var/log/telegraf/metric_aggregates.json"] + rotation_max_size = "30MB" + rotation_max_archives = 5 + data_format = "json" + namepass = ["agg_*"] + +# Enable Live Monitoring, intended for internal use: +#[[outputs.influxdb]] +# urls = ["http://dbsvr.company.com:8086"] +# database = "live_metrics" +# skip_database_creation = true +# data_format = "influx" + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Get CPU usage +[[inputs.cpu]] + percpu = true + totalcpu = true + collect_cpu_time = false + report_active = false + fieldpass = ["usage*"] + +# Get mount point stats +[[inputs.disk]] + mount_points = ["/","/domain0"] + +# Get disk I/O stats +[[inputs.diskio]] + +# Track stats for the current metric files +[[inputs.filestat]] + files = ["/var/log/telegraf/metrics.json", + "/var/log/telegraf/metrics_estat.json", + "/var/log/telegraf/metrics_zfs.json", + "/var/log/telegraf/metric_aggregates.json"] + +# Get Memory stats +[[inputs.mem]] + +# Get some network interface stats +[[inputs.net]] + fieldpass = ["tcp*","bytes*","packets*","err*","drop*"] + +# Track CPU and Memory for the "delphix-mgmt" service (and children). +[[inputs.procstat]] + systemd_unit = "delphix-mgmt.service" + include_systemd_children = true + namedrop = ["procstat_lookup"] + fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + +# Track CPU and Memory for the "zfs-object-agent" service (and children). 
+[[inputs.procstat]] + systemd_unit = "zfs-object-agent.service" + include_systemd_children = true + namedrop = ["procstat_lookup"] + fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + +# Get process counts +[[inputs.processes]] + +# Get swap memory usage +[[inputs.swap]] + +# Get misc 'other' stats (load and uptime) +[[inputs.system]] + +# ZFS kstats (arcstat, abdstat, zfetch, etc) +[[inputs.zfs]] + interval = "1m" + +# Detailed ZFS pool metrics from "zpool_influxdb" (noisy) +#[[inputs.exec]] +# commands = ["/usr/lib/x86_64-linux-gnu/zfs/zpool_influxdb"] +# data_format = "influx" + +############################################################################### +# AGGREGATION PLUGINS # +############################################################################### +# Filtered aggregate statistics +# Calculate Min, Max, Mean, Std Deviation every hour for selected metrics: +# CPU Usage (%idle) +[[aggregators.basicstats]] + period = "1h" + drop_original = false + stats = ["min", "max", "mean", "stdev"] + name_prefix = "agg_" + namepass = ["cpu","disk","diskio","mem","net","processes","system","swap"] + diff --git a/telegraf/telegraf.inputs.dose b/telegraf/telegraf.inputs.dose new file mode 100644 index 0000000..2d618ce --- /dev/null +++ b/telegraf/telegraf.inputs.dose @@ -0,0 +1,46 @@ +####################### DOSE/zcache Metrics ################################ +[[inputs.execd]] + command = ["/etc/telegraf/zcache-stats.sh"] + name_override = "zcache_stats" + signal = "none" + data_format = "csv" + csv_skip_columns = 1 + csv_column_names = ["cache_lookup_count","idx_access_pendch","idx_access_entry","idx_access_chunk", + "idx_access_disk","cache_hits_count","cache_hits_bytes","cache_hits_ratio", + "cache_insert_count","cache_insert_bytes","insert_source_read","insert_source_write", + "insert_source_specr","insert_drops_buffer","insert_drops_alloc","bufbytes_used_demand", + "bufbytes_used_spec","cache_other_evicts","cache_other_pending","alloc_alloc", + "alloc_avail","alloc_free_space","alloc_free_slabs"] + csv_column_types = ["int","int","int","int","int","int","int","int","int","int","int","int","int", + "int","int","int","int","int","int","int","int","int","int"] + csv_delimiter = "\t" + csv_trim_space = true + + +[[inputs.execd]] + command = ["/etc/telegraf/zpool-iostat-o.sh"] + name_override = "zpool_iostat-o" + signal = "none" + data_format = "csv" + csv_column_names = ["pool","agent_io_op_read","agent_io_op_write","agent_io_tput_read", + "agent_io_tput_write","store_data_op_get","store_data_op_put","store_data_tput_get", + "store_data_tput_put","store_metadata_op_get","store_metadata_op_put", + "store_metadata_tput_get","store_metadata_tput_put","store_reclaim_op_get", + "store_reclaim_op_put","store_reclaim_tput_get","store_reclaim_tput_put","object_del"] + csv_column_types = ["string","int","int","int","int","int","int","int","int","int","int","int","int", + "int","int","int","int","int"] + csv_tag_columns = ["pool"] + csv_delimiter = " " + csv_trim_space = true + + +[[inputs.exec]] + interval = "1h" + commands = ["/usr/sbin/zcache hits --json"] + name_override = "zcache_hits" + data_format = "json" + json_string_fields = ["start_time"] + + +# End of DOSE/zcache section + diff --git a/telegraf/telegraf.inputs.playbook b/telegraf/telegraf.inputs.playbook new file mode 100644 index 0000000..a97f36a --- /dev/null +++ b/telegraf/telegraf.inputs.playbook @@ -0,0 +1,149 @@ +############################################################################## +# Performance Playbook (estat, 
nfs_threads) collection + +# Collect output from "estat nfs -jm 10" +[[inputs.execd]] + command = ["estat", "nfs", "-jm", "10"] + name_override = "estat_nfs" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat iscsi -jm 10" +[[inputs.execd]] + command = ["estat", "iscsi", "-jm", "10"] + name_override = "estat_iscsi" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat zpl -jm 10" +[[inputs.execd]] + command = ["estat", "zpl", "-jm", "10"] + name_override = "estat_zpl" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat backend-io -jm 10" +[[inputs.execd]] + command = ["estat", "backend-io", "-jm", "10"] + name_override = "estat_backend-io" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat zvol -jm 10" +[[inputs.execd]] + command = ["estat", "zvol", "-jm", "10"] + name_override = "estat_zvol" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat zio -jm 10" +[[inputs.execd]] + command = ["estat", "zio", "-jm", "10"] + name_override = "estat_zio" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat metaslab-alloc -jm 10" +#[[inputs.execd]] +# command = ["estat", "metaslab-alloc", "-jm", "10"] +# name_override = "estat_metaslab-alloc" +# signal = "none" +# data_format = "json" +# tag_keys = [ +# "name", +# "axis" +# ] +# json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "nfs_threads" +[[inputs.execd]] + command = ["/etc/telegraf/nfs-threads.sh"] + name_override = "nfs_threads" + signal = "none" + data_format = "csv" + csv_skip_columns = 2 + csv_column_names = ["packets","sockets","woken","used","metadata","riops","rtput","wiops","wtput"] + csv_column_types = ["int", "int", "int", "int", "int", "float","string","float","string"] + csv_delimiter = " " + csv_trim_space = true + +# End of Playbook section +############################################################################## + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### +# Convert strings from estat into integer values so they don't get dropped +[[processors.converter]] + [processors.converter.fields] + integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] + +# The estat output contains a nested latency histogram, so we need to +# parse that out as a new array metric rather than a non-JSON string. 
+# +# From this: +# "microseconds":"{20000,5},{30000,15},{40000,3},{50000,24}" +# to this: +# "microseconds":"{20000:5,30000:15,40000:3,50000:24}" +# +# Clone the original so we have a "new" metric with a "hist_" name prefix +[[processors.clone]] + order = 1 + name_prefix = "hist_" + namepass = ["estat_*"] + +# Rewrite the histograms for the "hist_estat_*" metrics as JSON objects +[[processors.regex]] + order = 2 + namepass = ["hist_estat_*"] + [[processors.regex.fields]] + key = "microseconds" + pattern = "{(\\d+),(\\d+)}" + replacement = "\"${1}\":${2}" + [[processors.regex.fields]] + key = "microseconds" + pattern = ".*" + replacement = "{$0}" + +# Now parse out the arrays for "hist_estat_*" metrics +[[processors.parser]] + order = 3 + merge = "override" + parse_fields = ["microseconds"] + drop_original = false + data_format = "json" + namepass = ["hist_estat_*"] + fieldpass = ["microseconds"] + +# End of Processor section +############################################################################## diff --git a/telegraf/zcache-stats.sh b/telegraf/zcache-stats.sh new file mode 100755 index 0000000..dbe8101 --- /dev/null +++ b/telegraf/zcache-stats.sh @@ -0,0 +1,3 @@ +#!/bin/bash +zcache stats -ap 10 | egrep --line-buffered -v "\-" + diff --git a/telegraf/zpool-iostat-o.sh b/telegraf/zpool-iostat-o.sh new file mode 100755 index 0000000..6f309d4 --- /dev/null +++ b/telegraf/zpool-iostat-o.sh @@ -0,0 +1,3 @@ +#!/bin/bash +zpool iostat -opy domain0 10 | egrep --line-buffered -v "object|put|\-" + From 4c0716dca386a21d53692701a5e3549c22ebc7e2 Mon Sep 17 00:00:00 2001 From: Scott MacDonald <103631931+scottmdlpx@users.noreply.github.com> Date: Tue, 19 Jul 2022 18:16:18 +0100 Subject: [PATCH 40/43] CP-8525 [Backport of CP-8403 to 6.0.16.0] add telegraf-based metric collection (#86) --- cmd/estat.py | 23 +++-- debian/rules | 3 + telegraf/delphix-telegraf-service | 34 +++++++ telegraf/delphix-telegraf.service | 18 ++++ telegraf/nfs-threads.sh | 3 + telegraf/perf_playbook | 53 +++++++++++ telegraf/telegraf.base | 131 ++++++++++++++++++++++++++ telegraf/telegraf.inputs.dose | 46 +++++++++ telegraf/telegraf.inputs.playbook | 149 ++++++++++++++++++++++++++++++ telegraf/zcache-stats.sh | 3 + telegraf/zpool-iostat-o.sh | 3 + 11 files changed, 459 insertions(+), 7 deletions(-) create mode 100755 telegraf/delphix-telegraf-service create mode 100644 telegraf/delphix-telegraf.service create mode 100755 telegraf/nfs-threads.sh create mode 100755 telegraf/perf_playbook create mode 100644 telegraf/telegraf.base create mode 100644 telegraf/telegraf.inputs.dose create mode 100644 telegraf/telegraf.inputs.playbook create mode 100755 telegraf/zcache-stats.sh create mode 100755 telegraf/zpool-iostat-o.sh diff --git a/cmd/estat.py b/cmd/estat.py index 8d64b23..47d8f00 100755 --- a/cmd/estat.py +++ b/cmd/estat.py @@ -89,6 +89,7 @@ def die(*args, **kwargs): -q/-Q enable/disable latency histograms by size (default: off) -y/-Y enable/disable the summary output (default: on) -t/-T enable/disable emitting the summary total (default: on) + -j set output mode to JSON -d LEVEL set BCC debug level -e emit the resulting eBPF script without executing it @@ -148,6 +149,7 @@ def usage(msg): script_arg = None debug_level = 0 dump_bpf = False +output_mode = BCCHelper.ESTAT_PRINT_MODE class Args: @@ -160,6 +162,7 @@ class Args: setattr(args, "latsize_hist", False) setattr(args, "summary", True) setattr(args, "total", True) +setattr(args, "json", False) # # We use getopt rather than argparse because it is very difficult to get @@ 
-169,7 +172,7 @@ class Args: # arguments. # try: - opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLzZqQyYnNtTd:e") + opts, rem_args = getopt.getopt(sys.argv[2:], "hmMa:lLjzZqQyYnNtTd:e") except getopt.GetoptError as err: die(err) @@ -193,6 +196,7 @@ class Args: dump_bpf = True else: switches = {'-l': "lat_hist", + '-j': "json", '-z': "size_hist", '-q': "latsize_hist", '-y': "summary", @@ -218,6 +222,9 @@ class Args: if not (args.lat_hist or args.size_hist or args.latsize_hist): args.lat_hist = True +if args.json: + output_mode = BCCHelper.ANALYTICS_PRINT_MODE + # Now that we are done parsing arguments, construct the text of the BPF program try: with open(base_dir + 'bpf/estat/' + program + '.c', 'r') as prog_file: @@ -442,7 +449,7 @@ class Args: probe_type + "'") if args.lat_hist or args.size_hist or args.summary: - helper1 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) + helper1 = BCCHelper(b, output_mode) helper1.add_key_type("name") helper1.add_key_type("axis") @@ -464,7 +471,7 @@ class Args: "bytes") if args.latsize_hist: - helper2 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) + helper2 = BCCHelper(b, output_mode) helper2.add_aggregation("latsq", BCCHelper.LL_HISTOGRAM_AGGREGATION, "microseconds") helper2.add_key_type("size") @@ -472,15 +479,16 @@ class Args: helper2.add_key_type("axis") if args.summary and args.total: - helper3 = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE) + helper3 = BCCHelper(b, output_mode) helper3.add_aggregation("opst", BCCHelper.COUNT_AGGREGATION, "iops(/s)") helper3.add_aggregation("datat", BCCHelper.SUM_AGGREGATION, "throughput(k/s)") helper3.add_key_type("name") # Need real time; -print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate this line -print(" Tracing enabled... Hit Ctrl-C to end.") +if not args.json: + print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) # TODO deduplicate line + print(" Tracing enabled... Hit Ctrl-C to end.") # output if monitor: @@ -507,7 +515,8 @@ class Args: helper1.printall(clear_data) if args.summary and args.total: helper3.printall(clear_data) - print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) + if not args.json: + print("%-16s\n" % strftime("%D - %H:%M:%S %Z")) except Exception as e: die(e) else: diff --git a/debian/rules b/debian/rules index d1dbb9a..3f8b76c 100755 --- a/debian/rules +++ b/debian/rules @@ -22,3 +22,6 @@ override_dh_auto_install: dh_install build/cmd/* /usr/bin dh_install lib/* /usr/share/performance-diagnostics/lib dh_install bpf/* /usr/share/performance-diagnostics/bpf + dh_install telegraf/delphix-telegraf-service telegraf/perf_playbook /usr/bin + dh_install telegraf/delphix-telegraf.service /lib/systemd/system + dh_install telegraf/telegraf* telegraf/*.sh /etc/telegraf diff --git a/telegraf/delphix-telegraf-service b/telegraf/delphix-telegraf-service new file mode 100755 index 0000000..9b087e2 --- /dev/null +++ b/telegraf/delphix-telegraf-service @@ -0,0 +1,34 @@ +#!/bin/bash +BASE_CONFIG=/etc/telegraf/telegraf.base +DOSE_INPUTS=/etc/telegraf/telegraf.inputs.dose +PLAYBOOK_INPUTS=/etc/telegraf/telegraf.inputs.playbook +PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED +TELEGRAF_CONFIG=/etc/telegraf/telegraf.conf + + +function engine_is_object_based() { + zdb -C | grep "type: 'object_store'" >/dev/null + [[ "$?" 
== "0" ]] +} + +function playbook_is_enabled() { + [[ -f $PLAYBOOK_FLAG ]] +} + +rm -f $TELEGRAF_CONFIG + +if engine_is_object_based; then + if playbook_is_enabled; then + cat $PLAYBOOK_INPUTS $DOSE_INPUTS $BASE_CONFIG > $TELEGRAF_CONFIG + else + cat $DOSE_INPUTS $BASE_CONFIG > $TELEGRAF_CONFIG + fi +else + if playbook_is_enabled; then + cat $PLAYBOOK_INPUTS $BASE_CONFIG > $TELEGRAF_CONFIG + else + cat $BASE_CONFIG > $TELEGRAF_CONFIG + fi +fi + +/usr/bin/telegraf -config $TELEGRAF_CONFIG diff --git a/telegraf/delphix-telegraf.service b/telegraf/delphix-telegraf.service new file mode 100644 index 0000000..6c767e8 --- /dev/null +++ b/telegraf/delphix-telegraf.service @@ -0,0 +1,18 @@ +[Unit] +Description=Delphix Telegraf Metric Collection Agent +Documentation=https://github.com/influxdata/telegraf +PartOf=delphix.target +After=delphix-platform.service +PartOf=delphix-platform.service + +[Service] +EnvironmentFile=-/etc/default/telegraf +User=root +ExecStart=/usr/bin/delphix-telegraf-service +ExecReload=/bin/kill -HUP $MAINPID +Restart=on-failure +RestartForceExitStatus=SIGPIPE +KillMode=control-group + +[Install] +WantedBy=delphix.target diff --git a/telegraf/nfs-threads.sh b/telegraf/nfs-threads.sh new file mode 100755 index 0000000..cd530cf --- /dev/null +++ b/telegraf/nfs-threads.sh @@ -0,0 +1,3 @@ +#!/bin/sh +nfs_threads | egrep --line-buffered -v "thr" + diff --git a/telegraf/perf_playbook b/telegraf/perf_playbook new file mode 100755 index 0000000..20542d2 --- /dev/null +++ b/telegraf/perf_playbook @@ -0,0 +1,53 @@ +#!/bin/bash +# +# Copyright (c) 2021 by Delphix. All rights reserved. +# +# Script that enables and, disables the Performance Playbook configuration for +# metric collection by Telegraf +# + +PLAYBOOK_FLAG=/etc/telegraf/PLAYBOOK_ENABLED + +# +# Make sure this can only be run as root. +# +function die() { + echo -e "$(date +%T:%N:%z): $(basename $0): $*" >&2 + exit 1 +} + +[[ $EUID -ne 0 ]] && die "must be run as root" + +# +# Process command. +# + +function usage() { + echo "$(basename $0): $*" >&2 + echo "Usage: $(basename $0) [enable|disable]" + exit 2 +} + +function enable_playbook() { + date + echo "Enabling Performance Playbook Metrics" + touch $PLAYBOOK_FLAG + systemctl restart delphix-telegraf +} + +function disable_playbook() { + date + echo "Disabling Performance Playbook Metrics" + rm -rf $PLAYBOOK_FLAG + systemctl restart delphix-telegraf +} + +if [[ $# -ne 1 ]]; then + usage +fi + +case "$1" in +enable) enable_playbook ;; +disable) disable_playbook ;; +*) usage ;; +esac diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base new file mode 100644 index 0000000..41d28fb --- /dev/null +++ b/telegraf/telegraf.base @@ -0,0 +1,131 @@ +# Telegraf Configuration +# +# Configuration for telegraf agent +[agent] + interval = "10s" + round_interval = true + flush_interval = "10s" + metric_batch_size = 1000 + metric_buffer_limit = 10000 + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### +# Define the main metric output file, excluding aggregated stats and +# Performance Playbook (estat) data. 
+[[outputs.file]] + files = ["/var/log/telegraf/metrics.json"] + rotation_max_size = "50MB" + rotation_max_archives = 9 + data_format = "json" + namedrop = ["*estat_*", "agg_*", "zfs", "zpool*", "zcache*"] + +# Define output file for ZFS related metrics +[[outputs.file]] + files = ["/var/log/telegraf/metrics_zfs.json"] + rotation_max_size = "30MB" + rotation_max_archives = 5 + data_format = "json" + namepass = ["zpool*", "zcache*", "zfs"] + +# Define output file for Performance Playbook (estat) metrics +[[outputs.file]] + files = ["/var/log/telegraf/metrics_estat.json"] + rotation_max_size = "30MB" + rotation_max_archives = 5 + data_format = "json" + namepass = ["*estat_*"] + +# Define output file for aggregate statistics +[[outputs.file]] + files = ["/var/log/telegraf/metric_aggregates.json"] + rotation_max_size = "30MB" + rotation_max_archives = 5 + data_format = "json" + namepass = ["agg_*"] + +# Enable Live Monitoring, intended for internal use: +#[[outputs.influxdb]] +# urls = ["http://dbsvr.company.com:8086"] +# database = "live_metrics" +# skip_database_creation = true +# data_format = "influx" + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Get CPU usage +[[inputs.cpu]] + percpu = true + totalcpu = true + collect_cpu_time = false + report_active = false + fieldpass = ["usage*"] + +# Get mount point stats +[[inputs.disk]] + mount_points = ["/","/domain0"] + +# Get disk I/O stats +[[inputs.diskio]] + +# Track stats for the current metric files +[[inputs.filestat]] + files = ["/var/log/telegraf/metrics.json", + "/var/log/telegraf/metrics_estat.json", + "/var/log/telegraf/metrics_zfs.json", + "/var/log/telegraf/metric_aggregates.json"] + +# Get Memory stats +[[inputs.mem]] + +# Get some network interface stats +[[inputs.net]] + fieldpass = ["tcp*","bytes*","packets*","err*","drop*"] + +# Track CPU and Memory for the "delphix-mgmt" service (and children). +[[inputs.procstat]] + systemd_unit = "delphix-mgmt.service" + include_systemd_children = true + namedrop = ["procstat_lookup"] + fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + +# Track CPU and Memory for the "zfs-object-agent" service (and children). 
+[[inputs.procstat]] + systemd_unit = "zfs-object-agent.service" + include_systemd_children = true + namedrop = ["procstat_lookup"] + fieldpass = ["memory_usage", "cpu_usage", "memory_rss"] + +# Get process counts +[[inputs.processes]] + +# Get swap memory usage +[[inputs.swap]] + +# Get misc 'other' stats (load and uptime) +[[inputs.system]] + +# ZFS kstats (arcstat, abdstat, zfetch, etc) +[[inputs.zfs]] + interval = "1m" + +# Detailed ZFS pool metrics from "zpool_influxdb" (noisy) +#[[inputs.exec]] +# commands = ["/usr/lib/x86_64-linux-gnu/zfs/zpool_influxdb"] +# data_format = "influx" + +############################################################################### +# AGGREGATION PLUGINS # +############################################################################### +# Filtered aggregate statistics +# Calculate Min, Max, Mean, Std Deviation every hour for selected metrics: +# CPU Usage (%idle) +[[aggregators.basicstats]] + period = "1h" + drop_original = false + stats = ["min", "max", "mean", "stdev"] + name_prefix = "agg_" + namepass = ["cpu","disk","diskio","mem","net","processes","system","swap"] + diff --git a/telegraf/telegraf.inputs.dose b/telegraf/telegraf.inputs.dose new file mode 100644 index 0000000..2d618ce --- /dev/null +++ b/telegraf/telegraf.inputs.dose @@ -0,0 +1,46 @@ +####################### DOSE/zcache Metrics ################################ +[[inputs.execd]] + command = ["/etc/telegraf/zcache-stats.sh"] + name_override = "zcache_stats" + signal = "none" + data_format = "csv" + csv_skip_columns = 1 + csv_column_names = ["cache_lookup_count","idx_access_pendch","idx_access_entry","idx_access_chunk", + "idx_access_disk","cache_hits_count","cache_hits_bytes","cache_hits_ratio", + "cache_insert_count","cache_insert_bytes","insert_source_read","insert_source_write", + "insert_source_specr","insert_drops_buffer","insert_drops_alloc","bufbytes_used_demand", + "bufbytes_used_spec","cache_other_evicts","cache_other_pending","alloc_alloc", + "alloc_avail","alloc_free_space","alloc_free_slabs"] + csv_column_types = ["int","int","int","int","int","int","int","int","int","int","int","int","int", + "int","int","int","int","int","int","int","int","int","int"] + csv_delimiter = "\t" + csv_trim_space = true + + +[[inputs.execd]] + command = ["/etc/telegraf/zpool-iostat-o.sh"] + name_override = "zpool_iostat-o" + signal = "none" + data_format = "csv" + csv_column_names = ["pool","agent_io_op_read","agent_io_op_write","agent_io_tput_read", + "agent_io_tput_write","store_data_op_get","store_data_op_put","store_data_tput_get", + "store_data_tput_put","store_metadata_op_get","store_metadata_op_put", + "store_metadata_tput_get","store_metadata_tput_put","store_reclaim_op_get", + "store_reclaim_op_put","store_reclaim_tput_get","store_reclaim_tput_put","object_del"] + csv_column_types = ["string","int","int","int","int","int","int","int","int","int","int","int","int", + "int","int","int","int","int"] + csv_tag_columns = ["pool"] + csv_delimiter = " " + csv_trim_space = true + + +[[inputs.exec]] + interval = "1h" + commands = ["/usr/sbin/zcache hits --json"] + name_override = "zcache_hits" + data_format = "json" + json_string_fields = ["start_time"] + + +# End of DOSE/zcache section + diff --git a/telegraf/telegraf.inputs.playbook b/telegraf/telegraf.inputs.playbook new file mode 100644 index 0000000..a97f36a --- /dev/null +++ b/telegraf/telegraf.inputs.playbook @@ -0,0 +1,149 @@ +############################################################################## +# Performance Playbook (estat, 
nfs_threads) collection + +# Collect output from "estat nfs -jm 10" +[[inputs.execd]] + command = ["estat", "nfs", "-jm", "10"] + name_override = "estat_nfs" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat iscsi -jm 10" +[[inputs.execd]] + command = ["estat", "iscsi", "-jm", "10"] + name_override = "estat_iscsi" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat zpl -jm 10" +[[inputs.execd]] + command = ["estat", "zpl", "-jm", "10"] + name_override = "estat_zpl" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat backend-io -jm 10" +[[inputs.execd]] + command = ["estat", "backend-io", "-jm", "10"] + name_override = "estat_backend-io" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat zvol -jm 10" +[[inputs.execd]] + command = ["estat", "zvol", "-jm", "10"] + name_override = "estat_zvol" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat zio -jm 10" +[[inputs.execd]] + command = ["estat", "zio", "-jm", "10"] + name_override = "estat_zio" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "estat metaslab-alloc -jm 10" +#[[inputs.execd]] +# command = ["estat", "metaslab-alloc", "-jm", "10"] +# name_override = "estat_metaslab-alloc" +# signal = "none" +# data_format = "json" +# tag_keys = [ +# "name", +# "axis" +# ] +# json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + +# Collect output from "nfs_threads" +[[inputs.execd]] + command = ["/etc/telegraf/nfs-threads.sh"] + name_override = "nfs_threads" + signal = "none" + data_format = "csv" + csv_skip_columns = 2 + csv_column_names = ["packets","sockets","woken","used","metadata","riops","rtput","wiops","wtput"] + csv_column_types = ["int", "int", "int", "int", "int", "float","string","float","string"] + csv_delimiter = " " + csv_trim_space = true + +# End of Playbook section +############################################################################## + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### +# Convert strings from estat into integer values so they don't get dropped +[[processors.converter]] + [processors.converter.fields] + integer = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)"] + +# The estat output contains a nested latency histogram, so we need to +# parse that out as a new array metric rather than a non-JSON string. 
+# +# From this: +# "microseconds":"{20000,5},{30000,15},{40000,3},{50000,24}" +# to this: +# "microseconds":"{20000:5,30000:15,40000:3,50000:24}" +# +# Clone the original so we have a "new" metric with a "hist_" name prefix +[[processors.clone]] + order = 1 + name_prefix = "hist_" + namepass = ["estat_*"] + +# Rewrite the histograms for the "hist_estat_*" metrics as JSON objects +[[processors.regex]] + order = 2 + namepass = ["hist_estat_*"] + [[processors.regex.fields]] + key = "microseconds" + pattern = "{(\\d+),(\\d+)}" + replacement = "\"${1}\":${2}" + [[processors.regex.fields]] + key = "microseconds" + pattern = ".*" + replacement = "{$0}" + +# Now parse out the arrays for "hist_estat_*" metrics +[[processors.parser]] + order = 3 + merge = "override" + parse_fields = ["microseconds"] + drop_original = false + data_format = "json" + namepass = ["hist_estat_*"] + fieldpass = ["microseconds"] + +# End of Processor section +############################################################################## diff --git a/telegraf/zcache-stats.sh b/telegraf/zcache-stats.sh new file mode 100755 index 0000000..dbe8101 --- /dev/null +++ b/telegraf/zcache-stats.sh @@ -0,0 +1,3 @@ +#!/bin/bash +zcache stats -ap 10 | egrep --line-buffered -v "\-" + diff --git a/telegraf/zpool-iostat-o.sh b/telegraf/zpool-iostat-o.sh new file mode 100755 index 0000000..6f309d4 --- /dev/null +++ b/telegraf/zpool-iostat-o.sh @@ -0,0 +1,3 @@ +#!/bin/bash +zpool iostat -opy domain0 10 | egrep --line-buffered -v "object|put|\-" + From ecf228de705acc861084864e04fc5af074ccc039 Mon Sep 17 00:00:00 2001 From: Scott MacDonald Date: Fri, 19 Aug 2022 17:47:11 +0100 Subject: [PATCH 41/43] DLPX-82298 Telegraf needs a restart delay for external programs (#90) DLPX-82298 Telegraf needs a restart delay for external programs Added a 30s restart delay to external commands invoked via the "execd" plugin --- telegraf/perf_playbook | 4 ++-- telegraf/telegraf.inputs.dose | 2 ++ telegraf/telegraf.inputs.playbook | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/telegraf/perf_playbook b/telegraf/perf_playbook index 20542d2..8920963 100755 --- a/telegraf/perf_playbook +++ b/telegraf/perf_playbook @@ -30,14 +30,14 @@ function usage() { function enable_playbook() { date - echo "Enabling Performance Playbook Metrics" + echo "Enabling Performance Playbook Metric Collection" touch $PLAYBOOK_FLAG systemctl restart delphix-telegraf } function disable_playbook() { date - echo "Disabling Performance Playbook Metrics" + echo "Disabling Performance Playbook Metric Collection" rm -rf $PLAYBOOK_FLAG systemctl restart delphix-telegraf } diff --git a/telegraf/telegraf.inputs.dose b/telegraf/telegraf.inputs.dose index 2d618ce..408018e 100644 --- a/telegraf/telegraf.inputs.dose +++ b/telegraf/telegraf.inputs.dose @@ -3,6 +3,7 @@ command = ["/etc/telegraf/zcache-stats.sh"] name_override = "zcache_stats" signal = "none" + restart_delay = "30s" data_format = "csv" csv_skip_columns = 1 csv_column_names = ["cache_lookup_count","idx_access_pendch","idx_access_entry","idx_access_chunk", @@ -21,6 +22,7 @@ command = ["/etc/telegraf/zpool-iostat-o.sh"] name_override = "zpool_iostat-o" signal = "none" + restart_delay = "30s" data_format = "csv" csv_column_names = ["pool","agent_io_op_read","agent_io_op_write","agent_io_tput_read", "agent_io_tput_write","store_data_op_get","store_data_op_put","store_data_tput_get", diff --git a/telegraf/telegraf.inputs.playbook b/telegraf/telegraf.inputs.playbook index a97f36a..43fcccc 100644 ---
a/telegraf/telegraf.inputs.playbook +++ b/telegraf/telegraf.inputs.playbook @@ -6,6 +6,7 @@ command = ["estat", "nfs", "-jm", "10"] name_override = "estat_nfs" signal = "none" + restart_delay = "30s" data_format = "json" tag_keys = [ "name", @@ -18,6 +19,7 @@ command = ["estat", "iscsi", "-jm", "10"] name_override = "estat_iscsi" signal = "none" + restart_delay = "30s" data_format = "json" tag_keys = [ "name", @@ -30,6 +32,7 @@ command = ["estat", "zpl", "-jm", "10"] name_override = "estat_zpl" signal = "none" + restart_delay = "30s" data_format = "json" tag_keys = [ "name", @@ -42,6 +45,7 @@ command = ["estat", "backend-io", "-jm", "10"] name_override = "estat_backend-io" signal = "none" + restart_delay = "30s" data_format = "json" tag_keys = [ "name", @@ -54,6 +58,7 @@ command = ["estat", "zvol", "-jm", "10"] name_override = "estat_zvol" signal = "none" + restart_delay = "30s" data_format = "json" tag_keys = [ "name", @@ -66,6 +71,7 @@ command = ["estat", "zio", "-jm", "10"] name_override = "estat_zio" signal = "none" + restart_delay = "30s" data_format = "json" tag_keys = [ "name", @@ -78,6 +84,7 @@ # command = ["estat", "metaslab-alloc", "-jm", "10"] # name_override = "estat_metaslab-alloc" # signal = "none" +# restart_delay = "30s" # data_format = "json" # tag_keys = [ # "name", @@ -90,6 +97,7 @@ command = ["/etc/telegraf/nfs-threads.sh"] name_override = "nfs_threads" signal = "none" + restart_delay = "30s" data_format = "csv" csv_skip_columns = 2 csv_column_names = ["packets","sockets","woken","used","metadata","riops","rtput","wiops","wtput"] From 1340e416d9436a1d3faf72e54a40d903be372c19 Mon Sep 17 00:00:00 2001 From: Scott MacDonald Date: Fri, 19 Aug 2022 17:57:04 +0100 Subject: [PATCH 42/43] DLPX-82457 Telegraf: add zio-queue and metaslab-alloc to playbook collection (#91) DLPX-82457 Telegraf: add zio-queue and metaslab-alloc to playbook collection Added entries for these two commands --- telegraf/telegraf.base | 2 +- telegraf/telegraf.inputs.playbook | 34 +++++++++++++++++++++---------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base index 41d28fb..75526b2 100644 --- a/telegraf/telegraf.base +++ b/telegraf/telegraf.base @@ -44,7 +44,7 @@ data_format = "json" namepass = ["agg_*"] -# Enable Live Monitoring, intended for internal use: +# Enable Live Monitoring, intended for internal Delphix use only: #[[outputs.influxdb]] # urls = ["http://dbsvr.company.com:8086"] # database = "live_metrics" diff --git a/telegraf/telegraf.inputs.playbook b/telegraf/telegraf.inputs.playbook index 43fcccc..5ed7e21 100644 --- a/telegraf/telegraf.inputs.playbook +++ b/telegraf/telegraf.inputs.playbook @@ -66,6 +66,18 @@ ] json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] +# Collect output from "estat zio-queue -jm 10" +[[inputs.execd]] + command = ["estat", "zio-queue", "-jm", "10"] + name_override = "estat_zio-queue" + signal = "none" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] + # Collect output from "estat zio -jm 10" [[inputs.execd]] command = ["estat", "zio", "-jm", "10"] @@ -80,17 +92,17 @@ json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] # Collect output from "estat metaslab-alloc -jm 10" -#[[inputs.execd]] -# command = ["estat", "metaslab-alloc", "-jm", "10"] -# name_override = "estat_metaslab-alloc" -# signal 
= "none" -# restart_delay = "30s" -# data_format = "json" -# tag_keys = [ -# "name", -# "axis" -# ] -# json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] +[[inputs.execd]] + command = ["estat", "metaslab-alloc", "-jm", "10"] + name_override = "estat_metaslab-alloc" + signal = "none" + restart_delay = "30s" + data_format = "json" + tag_keys = [ + "name", + "axis" + ] + json_string_fields = ["iops(/s)", "avg latency(us)", "stddev(us)", "throughput(k/s)", "microseconds"] # Collect output from "nfs_threads" [[inputs.execd]] From 07c67095f14981fd3656909425d569cd8d51d462 Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Tue, 28 Mar 2023 12:16:06 -0700 Subject: [PATCH 43/43] XXX Log to perfdb.dcol1.delphix.com by default PR URL: https://www.github.com/delphix/performance-diagnostics/pull/93 --- telegraf/telegraf.base | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/telegraf/telegraf.base b/telegraf/telegraf.base index 75526b2..a0a1920 100644 --- a/telegraf/telegraf.base +++ b/telegraf/telegraf.base @@ -45,11 +45,11 @@ namepass = ["agg_*"] # Enable Live Monitoring, intended for internal Delphix use only: -#[[outputs.influxdb]] -# urls = ["http://dbsvr.company.com:8086"] -# database = "live_metrics" -# skip_database_creation = true -# data_format = "influx" +[[outputs.influxdb]] + urls = ["http://perfdb.dcol1.delphix.com:8086"] + database = "live_metrics" + skip_database_creation = true + data_format = "influx" ############################################################################### # INPUT PLUGINS #