diff --git a/.github/workflows/alpinelinux.yaml b/.github/workflows/alpinelinux.yaml index b143c398b34..a1c049b9b31 100644 --- a/.github/workflows/alpinelinux.yaml +++ b/.github/workflows/alpinelinux.yaml @@ -54,6 +54,7 @@ jobs: run: | cmake -B build -G Ninja \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DSeastar_USE_OPENSSL=yes \ -DSeastar_DOCS=OFF - name: Build Seastar diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 69aa186a70f..e69a953faa6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -33,6 +33,11 @@ on: type: string default: '' required: false + crypto_provider: + description: 'cryptographic provider to use' + type: string + default: 'GnuTLS' + required: false jobs: test: @@ -81,14 +86,15 @@ jobs: if ${{ inputs.enable-ccache }}; then MAYBE_CCACHE_OPT="--ccache" fi - ./configure.py \ - --c++-standard ${{ inputs.standard }} \ - --compiler ${{ inputs.compiler }} \ - --c-compiler $CC \ - --mode ${{ inputs.mode }} \ - $MAYBE_CCACHE_OPT \ - ${{ inputs.options }} \ - ${{ inputs.enables }} + ./configure.py \ + --c++-standard ${{ inputs.standard }} \ + --compiler ${{ inputs.compiler }} \ + --c-compiler $CC \ + --mode ${{ inputs.mode }} \ + $MAYBE_CCACHE_OPT \ + ${{ inputs.options }} \ + ${{ inputs.enables }} \ + --crypto-provider ${{ inputs.crypto_provider }} - name: Build run: cmake --build build/${{inputs.mode}} diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 91e23d8e3c9..eb91eb0d501 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -11,7 +11,7 @@ concurrency: jobs: regular_test: - name: "Test (${{ matrix.compiler }}, C++${{ matrix.standard}}, ${{ matrix.mode }})" + name: "Test (${{ matrix.compiler }}, C++${{ matrix.standard}}, ${{ matrix.mode }}, ${{ matrix.crypto_provider }})" uses: ./.github/workflows/test.yaml strategy: fail-fast: false @@ -19,12 +19,14 @@ jobs: compiler: [clang++, g++] standard: [20, 23] mode: [dev, debug, release] + crypto_provider: [GnuTLS, OpenSSL] with: compiler: ${{ matrix.compiler }} standard: ${{ matrix.standard }} mode: ${{ matrix.mode }} enables: ${{ matrix.enables }} options: ${{ matrix.options }} + crypto_provider: ${{ matrix.crypto_provider }} build_with_dpdk: name: "Test with DPDK enabled" uses: ./.github/workflows/test.yaml @@ -36,8 +38,8 @@ jobs: mode: release enables: --enable-dpdk options: --cook dpdk - build_with_cxx_modules: - name: "Test with C++20 modules enabled" + build_with_cxx_modules_gnutls: + name: "Test with C++20 modules enabled (GnuTLS)" uses: ./.github/workflows/test.yaml strategy: fail-fast: false @@ -47,3 +49,22 @@ jobs: mode: debug enables: --enable-cxx-modules enable-ccache: false + crypto_provider: GnuTLS + # disable modules build as we aren't using module and it is quite + # broken at the moment + if: false + build_with_cxx_modules_openssl: + name: "Test with C++20 modules enabled (OpenSSL)" + uses: ./.github/workflows/test.yaml + strategy: + fail-fast: false + with: + compiler: clang++ + standard: 23 + mode: debug + enables: --enable-cxx-modules + enable-ccache: false + crypto_provider: OpenSSL + # disable modules build as we aren't using module and it is quite + # broken at the moment + if: false diff --git a/CMakeLists.txt b/CMakeLists.txt index 982a9ba6a69..97cb3b2647e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,16 @@ if (NOT Seastar_SCHEDULING_GROUPS_COUNT MATCHES "^[1-9][0-9]*") message(FATAL_ERROR "Seastar_SCHEDULING_GROUPS_COUNT must be a positive number (${Seastar_SCHEDULING_GROUPS_COUNT})") endif () +option (Seastar_USE_OPENSSL + "Use OpenSSL rather than GnuTLS for cryptographic operations, including TLS" + OFF) + +if (Seastar_USE_OPENSSL) + set (Seastar_USE_GNUTLS OFF) +else () + set (Seastar_USE_GNUTLS ON) +endif () + # # Add a dev build type. # @@ -677,6 +687,7 @@ add_library (seastar src/core/reactor_backend.cc src/core/thread_pool.cc src/core/app-template.cc + src/core/cpu_profiler.cc src/core/disk_params.cc src/core/dpdk_rte.cc src/core/exception_hacks.cc @@ -709,6 +720,7 @@ add_library (seastar src/core/io_queue.cc src/core/semaphore.cc src/core/condition-variable.cc + src/core/signal_mutex.cc src/http/api_docs.cc src/http/common.cc src/http/file_handler.cc @@ -736,13 +748,16 @@ add_library (seastar src/net/native-stack-impl.hh src/net/native-stack.cc src/net/net.cc + $<$:src/net/ossl.cc> src/net/packet.cc src/net/posix-stack.cc src/net/proxy.cc src/net/socket_address.cc src/net/stack.cc src/net/tcp.cc - src/net/tls.cc + $<$:src/net/tls.cc> + src/net/tls-impl.cc + src/net/tls-impl.hh src/net/udp.cc src/net/unix_address.cc src/net/virtio.cc @@ -840,7 +855,9 @@ target_link_libraries (seastar SourceLocation::source_location PRIVATE ${CMAKE_DL_LIBS} - GnuTLS::gnutls + $<$:GnuTLS::gnutls> + $<$:OpenSSL::SSL> + $<$:OpenSSL::Crypto> StdAtomic::atomic lksctp-tools::lksctp-tools protobuf::libprotobuf @@ -892,6 +909,8 @@ include (CTest) target_compile_definitions(seastar PUBLIC SEASTAR_API_LEVEL=${Seastar_API_LEVEL} + $<$:SEASTAR_USE_OPENSSL> + $<$:SEASTAR_USE_GNUTLS> $<$:SEASTAR_BUILD_SHARED_LIBS>) target_compile_features(seastar diff --git a/cmake/SeastarDependencies.cmake b/cmake/SeastarDependencies.cmake index 2335a12d234..8731f306935 100644 --- a/cmake/SeastarDependencies.cmake +++ b/cmake/SeastarDependencies.cmake @@ -88,7 +88,12 @@ macro (seastar_find_dependencies) endif() seastar_find_dep (fmt 8.1.1 REQUIRED) seastar_find_dep (lz4 1.7.3 REQUIRED) - seastar_find_dep (GnuTLS 3.3.26 REQUIRED) + if (Seastar_USE_GNUTLS) + seastar_find_dep (GnuTLS 3.3.26 REQUIRED) + endif() + if (Seastar_USE_OPENSSL) + seastar_find_dep (OpenSSL 3.0.0 REQUIRED) + endif() if (Seastar_IO_URING) seastar_find_dep (LibUring 2.0 REQUIRED) endif() diff --git a/configure.py b/configure.py index c0b284bc170..68ccdebfa1e 100755 --- a/configure.py +++ b/configure.py @@ -89,6 +89,9 @@ def standard_supported(standard, compiler='g++'): arg_parser.add_argument('--verbose', dest='verbose', action='store_true', help='Make configure output more verbose.') arg_parser.add_argument('--scheduling-groups-count', action='store', dest='scheduling_groups_count', default='16', help='Number of available scheduling groups in the reactor') +arg_parser.add_argument('--crypto-provider', dest='crypto_provider', choices=seastar_cmake.SUPPORTED_CRYPTO_PROVIDERS, + default='GnuTLS', help='The cryptographic provider ot use') +arg_parser.add_argument('--openssl-root-dir', dest='openssl_root_dir', help="Root directory for OpenSSL library") add_tristate( arg_parser, @@ -191,6 +194,7 @@ def configure_mode(mode): '-DBUILD_SHARED_LIBS={}'.format('yes' if mode in ('debug', 'dev') else 'no'), '-DSeastar_API_LEVEL={}'.format(args.api_level), '-DSeastar_SCHEDULING_GROUPS_COUNT={}'.format(args.scheduling_groups_count), + '-DSeastar_USE_OPENSSL={}'.format('yes' if args.crypto_provider == 'OpenSSL' else 'no'), tr(args.exclude_tests, 'EXCLUDE_TESTS_FROM_ALL'), tr(args.exclude_apps, 'EXCLUDE_APPS_FROM_ALL'), tr(args.exclude_demos, 'EXCLUDE_DEMOS_FROM_ALL'), @@ -211,6 +215,9 @@ def configure_mode(mode): tr(args.debug_shared_ptr, 'DEBUG_SHARED_PTR', value_when_none='default'), ] + if args.openssl_root_dir is not None: + TRANSLATED_ARGS.appen(f'-DOPENSSL_ROOT_DIR={args.openssl_root_dir}') + ingredients_to_cook = set(args.cook) if args.dpdk: diff --git a/demos/tls_echo_server.hh b/demos/tls_echo_server.hh index e7185dac95e..4c633b1c520 100644 --- a/demos/tls_echo_server.hh +++ b/demos/tls_echo_server.hh @@ -46,70 +46,86 @@ class echoserver { seastar::gate _gate; bool _stopped = false; bool _verbose = false; + + future run_once() { + if (_stopped) { + return make_ready_future(stop_iteration::yes); + } + return with_gate(_gate, [this] { + return _socket.accept().then([this](accept_result ar) { + ::connected_socket s = std::move(ar.connection); + socket_address a = std::move(ar.remote_address); + if (_verbose) { + std::cout << "Got connection from "<< a << std::endl; + } + auto strms = make_lw_shared(std::move(s)); + return repeat([strms, this]() { + return strms->in.read().then([this, strms](temporary_buffer buf) { + if (buf.empty()) { + if (_verbose) { + std::cout << "EOM" << std::endl; + } + return make_ready_future(stop_iteration::yes); + } + sstring tmp(buf.begin(), buf.end()); + if (_verbose) { + std::cout << "Read " << tmp.size() << "B" << std::endl; + } + return strms->out.write(tmp).then([strms]() { + return strms->out.flush(); + }).then([] { + return make_ready_future(stop_iteration::no); + }); + }); + }).then([strms]{ + return strms->out.close(); + }).handle_exception([](auto ep) { + std::cout << "Exception: " << ep << std::endl; + }).finally([this, strms]{ + if (_verbose) { + std::cout << "Ending session" << std::endl; + } + return strms->in.close(); + }); + }).handle_exception([this](auto ep) { + if (!_stopped) { + std::cerr << "Error: " << ep << std::endl; + } + }).then([this] { + return make_ready_future(_stopped ? stop_iteration::yes : stop_iteration::no); + }); + }); + } public: echoserver(bool verbose = false) : _certs(make_shared(make_shared())) , _verbose(verbose) {} - future<> listen(socket_address addr, sstring crtfile, sstring keyfile, tls::client_auth ca = tls::client_auth::NONE) { - _certs->set_client_auth(ca); - return _certs->set_x509_key_file(crtfile, keyfile, tls::x509_crt_format::PEM).then([this, addr] { - ::listen_options opts; - opts.reuse_address = true; + future<> listen(socket_address addr, sstring crtfile, sstring keyfile, sstring cafile) { + _certs->set_dn_verification_callback([](seastar::tls::session_type, sstring subject, sstring issuer){ + std::cout << "DN Verification callback, subject: " << subject << " issuer: " << issuer << std::endl; + }); + auto f = make_ready_future(); + auto cauth = tls::client_auth::NONE; + if (cafile != "") { + cauth = tls::client_auth::REQUIRE; + f = _certs->set_x509_trust_file(cafile, tls::x509_crt_format::PEM); + } + _certs->set_client_auth(cauth); + return f.then([this, addr, crtfile, keyfile] { + return _certs->set_x509_key_file(crtfile, keyfile, tls::x509_crt_format::PEM).then([this, addr] { + ::listen_options opts; + opts.reuse_address = true; - _socket = tls::listen(_certs, addr, opts); + _socket = tls::listen(_certs, addr, opts); - // Listen in background. - (void)repeat([this] { - if (_stopped) { - return make_ready_future(stop_iteration::yes); - } - return with_gate(_gate, [this] { - return _socket.accept().then([this](accept_result ar) { - ::connected_socket s = std::move(ar.connection); - socket_address a = std::move(ar.remote_address); - if (_verbose) { - std::cout << "Got connection from "<< a << std::endl; - } - auto strms = make_lw_shared(std::move(s)); - return repeat([strms, this]() { - return strms->in.read().then([this, strms](temporary_buffer buf) { - if (buf.empty()) { - if (_verbose) { - std::cout << "EOM" << std::endl; - } - return make_ready_future(stop_iteration::yes); - } - sstring tmp(buf.begin(), buf.end()); - if (_verbose) { - std::cout << "Read " << tmp.size() << "B" << std::endl; - } - return strms->out.write(tmp).then([strms]() { - return strms->out.flush(); - }).then([] { - return make_ready_future(stop_iteration::no); - }); - }); - }).then([strms]{ - return strms->out.close(); - }).handle_exception([](auto ep) { - }).finally([this, strms]{ - if (_verbose) { - std::cout << "Ending session" << std::endl; - } - return strms->in.close(); - }); - }).handle_exception([this](auto ep) { - if (!_stopped) { - std::cerr << "Error: " << ep << std::endl; - } - }).then([this] { - return make_ready_future(_stopped ? stop_iteration::yes : stop_iteration::no); + // Listen in background. + (void)repeat([this] { + return run_once(); }); - }); + return make_ready_future(); }); - return make_ready_future(); }); } diff --git a/demos/tls_echo_server_demo.cc b/demos/tls_echo_server_demo.cc index 4c445fea973..9611f5e8b06 100644 --- a/demos/tls_echo_server_demo.cc +++ b/demos/tls_echo_server_demo.cc @@ -37,6 +37,7 @@ int main(int ac, char** av) { app.add_options() ("port", bpo::value()->default_value(10000), "Server port") ("address", bpo::value()->default_value("127.0.0.1"), "Server address") + ("ca,a", bpo::value()->default_value(""), "Server CA chain file") ("cert,c", bpo::value()->required(), "Server certificate file") ("key,k", bpo::value()->required(), "Certificate key") ("verbose,v", bpo::value()->default_value(false)->implicit_value(true), "Verbose") @@ -46,6 +47,7 @@ int main(int ac, char** av) { seastar_apps_lib::stop_signal stop_signal; auto&& config = app.configuration(); uint16_t port = config["port"].as(); + auto ca = config["ca"].as(); auto crt = config["cert"].as(); auto key = config["key"].as(); auto addr = config["address"].as(); @@ -61,7 +63,7 @@ int main(int ac, char** av) { auto stop_server = deferred_stop(server); try { - server.invoke_on_all(&echoserver::listen, socket_address(ia), sstring(crt), sstring(key), tls::client_auth::NONE).get(); + server.invoke_on_all(&echoserver::listen, socket_address(ia), sstring(crt), sstring(key),sstring(ca)).get(); } catch (...) { std::cerr << "Error: " << std::current_exception() << std::endl; return 1; diff --git a/demos/tls_simple_client_demo.cc b/demos/tls_simple_client_demo.cc index 0f93ec6e606..5a425271776 100644 --- a/demos/tls_simple_client_demo.cc +++ b/demos/tls_simple_client_demo.cc @@ -39,6 +39,8 @@ int main(int ac, char** av) { ("port", bpo::value()->default_value(10000), "Remote port") ("address", bpo::value()->default_value("127.0.0.1"), "Remote address") ("trust,t", bpo::value(), "Trust store") + ("certificate", bpo::value(), "Certficiate") + ("key,k", bpo::value(), "Private Keyfile") ("msg,m", bpo::value(), "Message to send") ("bytes,b", bpo::value()->default_value(512), "Use random bytes of length as message") ("iterations,i", bpo::value()->default_value(1), "Repeat X times") @@ -68,6 +70,15 @@ int main(int ac, char** av) { f = certs->set_x509_trust_file(config["trust"].as(), tls::x509_crt_format::PEM); } + if (config.count("certificate") && config.count("key")) { + f = f.then([certs, + cert = config["certificate"].as(), + key = config["key"].as()]{ + return certs->set_x509_key_file(cert, key, tls::x509_crt_format::PEM); + }); + } + + seastar::shared_ptr msg; if (config.count("msg")) { diff --git a/include/seastar/core/circular_buffer_fixed_capacity.hh b/include/seastar/core/circular_buffer_fixed_capacity.hh index 6c8871e66a7..2671a7feb35 100644 --- a/include/seastar/core/circular_buffer_fixed_capacity.hh +++ b/include/seastar/core/circular_buffer_fixed_capacity.hh @@ -30,6 +30,7 @@ #ifndef SEASTAR_MODULE #include +#include #include #include #include diff --git a/include/seastar/core/deleter.hh b/include/seastar/core/deleter.hh index 8957c578cda..5a7945d67d1 100644 --- a/include/seastar/core/deleter.hh +++ b/include/seastar/core/deleter.hh @@ -22,6 +22,7 @@ #pragma once #ifndef SEASTAR_MODULE +#include #include #include #include @@ -30,7 +31,15 @@ #include #endif +// The forward declarations of classes below are used for +// friending by the deleter. +struct test_deleter_append_does_not_free_shared_object; +struct test_deleter_append_same_shared_object_twice; + namespace seastar { +namespace net { + class packet; +}; /// \addtogroup memory-module /// @{ @@ -86,11 +95,19 @@ public: this->~deleter(); new (this) deleter(i); } +private: /// \endcond /// Appends another deleter to this deleter. When this deleter is /// destroyed, both encapsulated actions will be carried out. + /// + /// This operation is not thread-safe and therefore not made public + /// except for a few manually verified uses that are marked as freinds + /// below. void append(deleter d); -private: + friend class ::seastar::net::packet; + friend struct ::test_deleter_append_does_not_free_shared_object; + friend struct ::test_deleter_append_same_shared_object_twice; + static bool is_raw_object(impl* i) noexcept { auto x = reinterpret_cast(i); return x & 1; @@ -113,7 +130,9 @@ private: /// \cond internal struct deleter::impl { - unsigned refs = 1; + // The memory ordering on operations to this counter is similar to + // std::shared_ptr. + std::atomic refs = 1; deleter next; impl(deleter next) : next(std::move(next)) {} virtual ~impl() {} @@ -126,7 +145,7 @@ deleter::~deleter() { std::free(to_raw_object()); return; } - if (_impl && --_impl->refs == 0) { + if (_impl && _impl->refs.fetch_sub(1, std::memory_order_acq_rel) == 1) { delete _impl; } } @@ -209,7 +228,7 @@ deleter::share() { if (is_raw_object()) { _impl = new free_deleter_impl(to_raw_object()); } - ++_impl->refs; + _impl->refs.fetch_add(1, std::memory_order_relaxed); return deleter(_impl); } diff --git a/include/seastar/core/disk_params.hh b/include/seastar/core/disk_params.hh index dfbf5aaef0c..e3f2b8d01dc 100644 --- a/include/seastar/core/disk_params.hh +++ b/include/seastar/core/disk_params.hh @@ -47,6 +47,7 @@ struct disk_params { uint64_t write_saturation_length = std::numeric_limits::max(); bool duplex = false; float rate_factor = 1.0; + bool max_cost_function = true; }; SEASTAR_MODULE_EXPORT diff --git a/include/seastar/core/internal/cpu_profiler.hh b/include/seastar/core/internal/cpu_profiler.hh new file mode 100644 index 00000000000..7fb369cb1f6 --- /dev/null +++ b/include/seastar/core/internal/cpu_profiler.hh @@ -0,0 +1,182 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2023 ScyllaDB + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace seastar { + +class reactor; + +struct cpu_profiler_trace { + using kernel_trace_vec = boost::container::static_vector; + simple_backtrace user_backtrace; + kernel_trace_vec kernel_backtrace; + // The scheduling group active at the time the same was taken. Note that + // non-task reactor work (such as polling) ends up the in the default + // scheduling group (with name "main"). + scheduling_group sg; +}; + +constexpr size_t max_number_of_traces = 128; + +namespace internal { + +// Temporarily enable/disable the CPU profiler from taking stacktraces on this thread, +// but don't disable the profiler completely. This can be used disable the profiler +// for cases when taking a backtrace isn't valid (IE JIT generated code). +void profiler_drop_stacktraces(bool) noexcept; + +// A small RAII object to disable profiling temporarily +// +// This is not reentrant. +class scoped_disable_profile_temporarily { +public: + scoped_disable_profile_temporarily() noexcept { + profiler_drop_stacktraces(true); + } + ~scoped_disable_profile_temporarily() noexcept { + profiler_drop_stacktraces(false); + } +}; + +struct cpu_profiler_config { + bool enabled; + std::chrono::nanoseconds period; +}; + +struct cpu_profiler_stats { + unsigned dropped_samples_from_manual_disablement{0}; + unsigned dropped_samples_from_exceptions{0}; + unsigned dropped_samples_from_buffer_full{0}; + unsigned dropped_samples_from_mutex_contention{0}; + + void clear_dropped() { + dropped_samples_from_manual_disablement = 0; + dropped_samples_from_exceptions = 0; + dropped_samples_from_buffer_full = 0; + dropped_samples_from_mutex_contention = 0; + } + + unsigned sum_dropped() const { + return dropped_samples_from_manual_disablement + + dropped_samples_from_buffer_full + + dropped_samples_from_exceptions + + dropped_samples_from_mutex_contention; + } +}; + +class cpu_profiler { +private: + circular_buffer_fixed_capacity _traces; + // The operations in `_traces` are not reentrant. Therefore mutex is used to ensure + // that an interrupt cannot access `_traces` if the interrupted thread was already + // accessing it. + signal_mutex _traces_mutex; + cpu_profiler_config _cfg; + std::chrono::nanoseconds _last_set_timeout; + cpu_profiler_stats _stats; + bool _is_stopped{true}; + + + bool is_enabled() const; + std::chrono::nanoseconds period() const; + std::chrono::nanoseconds get_next_timeout(); + +protected: + friend reactor; + +public: + static int signal_number() { return SIGRTMIN + 2; } + + cpu_profiler(cpu_profiler_config cfg) : _cfg(cfg) {} + + // Allows for the sampling period of the profiler to be adjusted + // and the profiler to be enabled and disabled. + void update_config(cpu_profiler_config cfg); + // Stops the profiler if running and prevents it from starting until + // `start()` is explicitly called. + void stop(); + // Allows to profiler to run when it's enabled via the `cpu_profiler_config`. + void start(); + void on_signal(); + size_t results(std::vector& results_buffer); + + virtual ~cpu_profiler() = default; + virtual void arm_timer(std::chrono::nanoseconds) = 0; + virtual void disarm_timer() = 0; + virtual bool is_spurious_signal() { return false; } + virtual std::optional + try_get_kernel_backtrace() { return std::nullopt; } +}; + +class cpu_profiler_posix_timer : public cpu_profiler { + posix_timer _timer; +public: + cpu_profiler_posix_timer(cpu_profiler_config cfg) + : cpu_profiler(cfg) + // CLOCK_MONOTONIC is used here in place of CLOCK_THREAD_CPUTIME_ID. + // This is since for intervals of ~5ms or less CLOCK_THREAD_CPUTIME_ID + // fires 200-600% after it's configured time. Therefore it is not granular + // enough for cases where the reactor is configured to sleep when idle and + // is only active for short intervals. CLOCK_MONOTONIC doesn't suffer from + // this issue. + , _timer({signal_number()}, CLOCK_MONOTONIC) {} + + virtual ~cpu_profiler_posix_timer() override = default; + virtual void arm_timer(std::chrono::nanoseconds) override; + virtual void disarm_timer() override; +}; + +class cpu_profiler_linux_perf_event : public cpu_profiler { + linux_perf_event _perf_event; +public: + static std::unique_ptr try_make(cpu_profiler_config); + cpu_profiler_linux_perf_event(linux_perf_event perf_event, cpu_profiler_config cfg) + : cpu_profiler(cfg) + , _perf_event(std::move(perf_event)) {} + + virtual ~cpu_profiler_linux_perf_event() override = default; + virtual void arm_timer(std::chrono::nanoseconds) override; + virtual void disarm_timer() override; + virtual bool is_spurious_signal() override; + virtual std::optional + try_get_kernel_backtrace() override; +}; + +std::unique_ptr make_cpu_profiler(cpu_profiler_config cfg = {false, std::chrono::milliseconds(100)}); + +} +} diff --git a/include/seastar/core/internal/signal_mutex.hh b/include/seastar/core/internal/signal_mutex.hh new file mode 100644 index 00000000000..e48841cc772 --- /dev/null +++ b/include/seastar/core/internal/signal_mutex.hh @@ -0,0 +1,52 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2025 ScyllaDB + */ + +#pragma once + +#include +#include + +namespace seastar::internal { + +/// A lightweight mutex designed to work with interrupts +/// utilizing only compiler barriers. +class signal_mutex { +public: + class guard { + private: + signal_mutex* _mutex; + guard(signal_mutex* m) : _mutex(m) {} + friend class signal_mutex; + public: + guard(guard&& o) : _mutex(o._mutex) { o._mutex = nullptr; } + ~guard(); + }; + + // Returns a `guard` if the lock was acquired. + // Otherwise returns a nullopt. + std::optional try_lock(); + +private: + friend class guard; + std::atomic_bool _mutex; +}; + +} // namespace seastar::internal diff --git a/include/seastar/core/internal/stall_detector.hh b/include/seastar/core/internal/stall_detector.hh index 9505e7cddea..8e167e60a53 100644 --- a/include/seastar/core/internal/stall_detector.hh +++ b/include/seastar/core/internal/stall_detector.hh @@ -34,6 +34,7 @@ #include #include #include +#include #include namespace seastar { @@ -97,85 +98,25 @@ public: }; class cpu_stall_detector_posix_timer : public cpu_stall_detector { - timer_t _timer; + posix_timer _timer; public: explicit cpu_stall_detector_posix_timer(cpu_stall_detector_config cfg = {}); - virtual ~cpu_stall_detector_posix_timer() override; + virtual ~cpu_stall_detector_posix_timer() override = default; private: virtual void arm_timer() override; virtual void start_sleep() override; }; class cpu_stall_detector_linux_perf_event : public cpu_stall_detector { - file_desc _fd; - bool _enabled = false; - uint64_t _current_period = 0; - struct ::perf_event_mmap_page* _mmap; - char* _data_area; - size_t _data_area_mask; - // after the detector has been armed (i.e., _enabled is true), this - // is the moment at or after which the next signal is expected to occur - // and can be used for detecting spurious signals - sched_clock::time_point _next_signal_time{}; -private: - class data_area_reader { - cpu_stall_detector_linux_perf_event& _p; - const char* _data_area; - size_t _data_area_mask; - uint64_t _head; - uint64_t _tail; - public: - explicit data_area_reader(cpu_stall_detector_linux_perf_event& p) - : _p(p) - , _data_area(p._data_area) - , _data_area_mask(p._data_area_mask) { - _head = _p._mmap->data_head; - _tail = _p._mmap->data_tail; - std::atomic_thread_fence(std::memory_order_acquire); // required after reading data_head - } - ~data_area_reader() { - std::atomic_thread_fence(std::memory_order_release); // not documented, but probably required before writing data_tail - _p._mmap->data_tail = _tail; - } - uint64_t read_u64() { - uint64_t ret; - // We cannot wrap around if the 8-byte unit is aligned - std::copy_n(_data_area + (_tail & _data_area_mask), 8, reinterpret_cast(&ret)); - _tail += 8; - return ret; - } - template - S read_struct() { - static_assert(sizeof(S) % 8 == 0); - S ret; - char* p = reinterpret_cast(&ret); - for (size_t i = 0; i != sizeof(S); i += 8) { - uint64_t w = read_u64(); - std::copy_n(reinterpret_cast(&w), 8, p + i); - } - return ret; - } - void skip(uint64_t bytes_to_skip) { - _tail += bytes_to_skip; - } - // skip all the remaining data in the buffer, as-if calling read until - // have_data returns false (but much faster) - void skip_all() { - _tail = _head; - } - bool have_data() const { - return _head != _tail; - } - }; - - virtual void maybe_report_kernel_trace(backtrace_buffer& buf) override; + linux_perf_event _perf_event; public: static std::unique_ptr try_make(cpu_stall_detector_config cfg = {}); - explicit cpu_stall_detector_linux_perf_event(file_desc fd, cpu_stall_detector_config cfg = {}); - ~cpu_stall_detector_linux_perf_event(); + explicit cpu_stall_detector_linux_perf_event(linux_perf_event perf_event, cpu_stall_detector_config cfg = {}); + virtual ~cpu_stall_detector_linux_perf_event() override = default; virtual void arm_timer() override; virtual void start_sleep() override; virtual bool is_spurious_signal() override; + virtual void maybe_report_kernel_trace(backtrace_buffer& buf) override; }; std::unique_ptr make_cpu_stall_detector(cpu_stall_detector_config cfg = {}); diff --git a/include/seastar/core/internal/timers.hh b/include/seastar/core/internal/timers.hh new file mode 100644 index 00000000000..1aeb04d4c36 --- /dev/null +++ b/include/seastar/core/internal/timers.hh @@ -0,0 +1,147 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2023 ScyllaDB + */ + +#pragma once + +#ifndef SEASTAR_MODULE +#include +#include +#include +#include +#include + +#include +#endif + +#include +#include +#include + +namespace seastar { +namespace internal { + +struct timer_cfg { + int signal_number; +}; + +class posix_timer { + timer_t _timer; +public: + explicit posix_timer(timer_cfg cfg, clockid_t clock_id = CLOCK_THREAD_CPUTIME_ID); + virtual ~posix_timer(); + void arm_timer(std::chrono::nanoseconds); + void disarm_timer(); +}; + +class linux_perf_event { + file_desc _fd; + bool _enabled = false; + uint64_t _current_period = 0; + struct ::perf_event_mmap_page* _mmap; + char* _data_area; + size_t _data_area_mask; + // after the detector has been armed (i.e., _enabled is true), this + // is the moment at or after which the next signal is expected to occur + // and can be used for detecting spurious signals + sched_clock::time_point _next_signal_time{}; +private: + class data_area_reader { + std::reference_wrapper _p; + const char* _data_area; + size_t _data_area_mask; + uint64_t _head; + uint64_t _tail; + public: + explicit data_area_reader(linux_perf_event& p) + : _p(p) + , _data_area(p._data_area) + , _data_area_mask(p._data_area_mask) { + _head = _p.get()._mmap->data_head; + _tail = _p.get()._mmap->data_tail; + std::atomic_thread_fence(std::memory_order_acquire); // required after reading data_head + } + data_area_reader(data_area_reader&& o) + : _p(o._p) + , _data_area(o._data_area) + , _data_area_mask(o._data_area_mask) + , _head(o._head) + , _tail(o._tail) { + o._data_area = nullptr; + } + ~data_area_reader() { + if(_data_area != nullptr) { + std::atomic_thread_fence(std::memory_order_release); // not documented, but probably required before writing data_tail + _p.get()._mmap->data_tail = _tail; + } + } + uint64_t read_u64() { + + uint64_t ret; + // We cannot wrap around if the 8-byte unit is aligned + std::copy_n(_data_area + (_tail & _data_area_mask), 8, reinterpret_cast(&ret)); + _tail += 8; + return ret; + } + template + S read_struct() { + static_assert(sizeof(S) % 8 == 0); + S ret; + char* p = reinterpret_cast(&ret); + for (size_t i = 0; i != sizeof(S); i += 8) { + uint64_t w = read_u64(); + std::copy_n(reinterpret_cast(&w), 8, p + i); + } + return ret; + } + void skip(uint64_t bytes_to_skip) { + _tail += bytes_to_skip; + } + // skip all the remaining data in the buffer, as-if calling read until + // have_data returns false (but much faster) + void skip_all() { + _tail = _head; + } + bool have_data() const { + return _head != _tail; + } + }; + + explicit linux_perf_event(file_desc fd); +public: + + class kernel_backtrace { + data_area_reader _reader; + public: + kernel_backtrace(data_area_reader reader) : _reader(std::move(reader)) {} + void read_backtrace(std::function); + }; + + linux_perf_event(linux_perf_event&&); + static linux_perf_event try_make(timer_cfg cfg); + ~linux_perf_event(); + void arm_timer(std::chrono::nanoseconds); + void disarm_timer(); + bool is_spurious_signal(); + std::optional try_get_kernel_backtrace(); +}; + +} +} diff --git a/include/seastar/core/io_queue.hh b/include/seastar/core/io_queue.hh index f8466f3496b..c9e6f8591b4 100644 --- a/include/seastar/core/io_queue.hh +++ b/include/seastar/core/io_queue.hh @@ -24,6 +24,7 @@ #ifndef SEASTAR_MODULE #include #include +#include #include #include #include @@ -196,6 +197,13 @@ public: double flow_ratio_backpressure_threshold = 1.1; std::chrono::milliseconds stall_threshold = std::chrono::milliseconds(100); std::chrono::microseconds tau = std::chrono::milliseconds(5); + + // Original values of io-properties (if available) + size_t read_bytes_rate = std::numeric_limits::max(); + size_t write_bytes_rate = std::numeric_limits::max(); + size_t read_req_rate = std::numeric_limits::max(); + size_t write_req_rate = std::numeric_limits::max(); + bool max_cost_function = true; }; io_queue(io_group_ptr group, internal::io_sink& sink); diff --git a/include/seastar/core/metrics.hh b/include/seastar/core/metrics.hh index 73bc293a046..ebc8cee3a91 100644 --- a/include/seastar/core/metrics.hh +++ b/include/seastar/core/metrics.hh @@ -405,6 +405,7 @@ public: virtual metric_groups_def& add_metric(group_name_type name, const metric_definition& md) = 0; virtual metric_groups_def& add_group(group_name_type name, const std::initializer_list& l) = 0; virtual metric_groups_def& add_group(group_name_type name, const std::vector& l) = 0; + virtual int get_handle() const = 0; }; instance_id_type shard(); @@ -679,6 +680,13 @@ impl::metric_definition_impl make_total_operations(metric_name_type name, return make_counter(name, std::forward(val), d, labels).set_type("total_operations"); } +/*! + * \brief Update the aggregation labels of a metric family + */ +void update_aggregate_labels(const group_name_type& group_name, + const metric_name_type& metric_name, + const std::vector