From c8887e2c62840f6bd758f5149ca3b8dc8503095e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gell=C3=A9rt=20Peresztegi-Nagy?= Date: Wed, 15 Apr 2026 10:51:12 +0100 Subject: [PATCH 1/5] ossl: use make_ossl_error for BIO creation failure Every other OpenSSL failure path in the session constructor uses make_ossl_error, which drains the error queue and includes the OpenSSL error details in the exception. This one used a plain std::runtime_error. Align it for consistency. --- src/net/ossl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/net/ossl.cc b/src/net/ossl.cc index e293dca217e..9d5bb6960fa 100644 --- a/src/net/ossl.cc +++ b/src/net/ossl.cc @@ -1023,7 +1023,7 @@ class session : public enable_shared_from_this, public session_impl { bio_ptr in_bio(BIO_new(get_method())); bio_ptr out_bio(BIO_new(get_method())); if (!in_bio || !out_bio) { - throw std::runtime_error("Failed to create BIOs"); + throw make_ossl_error("Failed to create BIOs"); } if (1 != BIO_ctrl(in_bio.get(), BIO_C_SET_POINTER, 0, this)) { throw make_ossl_error("Failed to set bio ptr to in bio"); From 808f6e2cda58b91618a48aa7fb047e978af03a3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gell=C3=A9rt=20Peresztegi-Nagy?= Date: Wed, 15 Apr 2026 10:51:21 +0100 Subject: [PATCH 2/5] ossl: drain error queue on unexpected error codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drain the error queue and include the OpenSSL error details in the exception for the default switch cases in handle_do_put_ssl_err, do_handshake, do_get and do_shutdown. These default cases are not expected to be reachable — the SSL_get_error codes they cover require callbacks, modes, or BIO configurations that seastar does not use. This change is defensive: if they are ever reached, the error queue is now drained and the OpenSSL error details are included in the exception message rather than being silently discarded. --- src/net/ossl.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/net/ossl.cc b/src/net/ossl.cc index 9d5bb6960fa..51b6308aea0 100644 --- a/src/net/ossl.cc +++ b/src/net/ossl.cc @@ -183,6 +183,10 @@ std::system_error make_ossl_error(const std::string & msg) { return make_ossl_error(msg, get_all_ossl_errors()); } +std::runtime_error make_unknown_ossl_error(const std::string & msg) { + return std::runtime_error(fmt::format("{}: {}", msg, get_all_ossl_errors())); +} + bool contains_ossl_error(const std::vector & error_codes, int lib, int reason) { return std::any_of(error_codes.cbegin(), error_codes.cend(), [lib, reason](const ossl_errc & code) { return ERR_GET_LIB(static_cast(code)) == lib && @@ -1151,7 +1155,7 @@ class session : public enable_shared_from_this, public session_impl { default: { // Some other unhandled situation - auto err = std::runtime_error( + auto err = make_unknown_ossl_error( "Unknown error encountered during SSL write"); return handle_output_error(std::move(err)).then([] { return stop_iteration::yes; @@ -1326,7 +1330,7 @@ class session : public enable_shared_from_this, public session_impl { return handle_output_error(std::move(err)); } default: - auto err = std::runtime_error( + auto err = make_unknown_ossl_error( "Unknown error encountered during handshake"); return handle_output_error(std::move(err)); } @@ -1451,7 +1455,7 @@ class session : public enable_shared_from_this, public session_impl { return make_exception_future(_error); } default: - _error = std::make_exception_ptr(std::runtime_error( + _error = std::make_exception_ptr(make_unknown_ossl_error( "Unexpected error condition during SSL read")); return make_exception_future(_error); } @@ -1536,7 +1540,7 @@ class session : public enable_shared_from_this, public session_impl { } default: { - auto err = std::runtime_error( + auto err = make_unknown_ossl_error( "Unknown error occurred during SSL shutdown"); return handle_output_error(std::move(err)); } From 9b013e006d7b19a7f9f486406d0c1d2fb38a6eb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gell=C3=A9rt=20Peresztegi-Nagy?= Date: Wed, 15 Apr 2026 10:56:00 +0100 Subject: [PATCH 3/5] ossl: clear error queue after successful SSL_CTX_new MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SSL_CTX_new can return a valid context while leaving errors on the per-thread error queue. This happens when OpenSSL's system config parsing partially fails but the failure is masked — for example, ssl_do_config may call SSL_CONF_cmd which pushes errors (e.g. SSL_R_NO_CIPHER_MATCH from an invalid Ciphersuites value in the system openssl.cnf), but ssl_do_config itself returns success when the system flag is set and conf_diagnostics is disabled. Introduce a clear_stale_ssl_errors helper that drains and logs any stale errors at debug level, and use it after SSL_CTX_new succeeds. --- src/net/ossl.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/net/ossl.cc b/src/net/ossl.cc index 51b6308aea0..1cc5e52049b 100644 --- a/src/net/ossl.cc +++ b/src/net/ossl.cc @@ -1825,6 +1825,17 @@ SEASTAR_INTERNAL_END_IGNORE_DEPRECATIONS } private: + // Some SSL operations return success while leaving stale errors on the + // queue (e.g. from internal BIO write failures that OpenSSL absorbed). + // Drain them so they don't poison the next operation on this shard. + void clear_stale_ssl_errors(const char* operation) { + if (ERR_peek_error() == 0) [[likely]] { + return; + } + auto errors = get_all_ossl_errors(); + tls_log.debug("{} {}: ignoring stale errors on queue: {}", *this, operation, errors); + } + std::vector do_get_alt_name_information(const x509_ptr &peer_cert, const std::unordered_set &types) const { int ext_idx = X509_get_ext_by_NID( @@ -1993,6 +2004,11 @@ SEASTAR_INTERNAL_END_IGNORE_DEPRECATIONS throw make_ossl_error( "Failed to initialize SSL context"); } + // SSL_CTX_new can return a valid context while leaving errors on the + // error queue from partially-failed system config parsing (e.g. an + // invalid Ciphersuites value in the system openssl.cnf). + // See https://github.com/openssl/openssl/issues/30760 + clear_stale_ssl_errors("SSL_CTX_new"); const auto& ck_pair = _creds->get_certkey_pair(); if (type == session_type::SERVER) { if (!ck_pair) { From 8277c2aab1fcd8a1bcc4cc71a0db17c83515620f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gell=C3=A9rt=20Peresztegi-Nagy?= Date: Wed, 15 Apr 2026 10:58:15 +0100 Subject: [PATCH 4/5] ossl: clear error queue after successful SSL operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several SSL operations can return success while leaving errors on the per-thread error queue. This happens when OpenSSL internally writes through our custom BIO (e.g. to send a close_notify alert, a TLS 1.3 NewSessionTicket, or an application data record), our bwrite callback fails with EPIPE and returns 0, but OpenSSL's record layer misclassifies the failure as success. Root cause: bio_write_intern (crypto/bio/bio_lib.c) passes our bwrite return of 0 through unchanged. BIO_write returns 0. Then tls_retry_write_records (ssl/record/methods/tls_common.c) checks 'if (i >= 0)' and classifies BIO_write returning 0 as success. Our bwrite returns 0 following the documented BIO_write_ex contract (1 success, 0 failure), but BIO_write's own contract says 0 means "BIO is NULL or dlen <= 0" — not an error. The read side had this same class of bug, fixed on master in OpenSSL commit be42447469. The affected operations and how they reach our bwrite callback: - SSL_shutdown: ssl3_shutdown -> ssl3_send_alert -> ssl3_dispatch_alert -> write_records -> tls_retry_write_records -> BIO_write -> bio_write_intern -> our bwrite callback (sending close_notify alert) - SSL_do_handshake: state_machine -> TLS_ST_SW_SESSION_TICKET -> write_records -> tls_retry_write_records -> BIO_write -> bio_write_intern -> our bwrite callback (sending NewSessionTicket). Additionally, statem_srvr.c deliberately ignores flush failures via conn_is_closed() for this case. - SSL_write_ex: ssl3_write_bytes -> tls_write_records -> tls_retry_write_records -> BIO_write -> bio_write_intern -> our bwrite callback (sending application data record) - SSL_read_ex: can internally trigger writes (e.g. TLS 1.3 key update responses) through the same record layer write path. Not yet observed in test logs but covered defensively. This is a workaround for the missing return-value translation in bio_write_intern and can be removed once the upstream fix is available. --- src/net/ossl.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/net/ossl.cc b/src/net/ossl.cc index 1cc5e52049b..92f54828858 100644 --- a/src/net/ossl.cc +++ b/src/net/ossl.cc @@ -1205,6 +1205,7 @@ class session : public enable_shared_from_this, public session_impl { co_return; } } else { + clear_stale_ssl_errors("SSL_write_ex"); SEASTAR_ASSERT(bytes_written <= size); tls_log.trace("{} do_put: bytes_written: {}", *this, bytes_written); ptr += bytes_written; @@ -1335,6 +1336,7 @@ class session : public enable_shared_from_this, public session_impl { return handle_output_error(std::move(err)); } } else { + clear_stale_ssl_errors("SSL_do_handshake"); if (_type == session_type::CLIENT || _creds->get_client_auth() != client_auth::NONE) { verify(); @@ -1460,6 +1462,7 @@ class session : public enable_shared_from_this, public session_impl { return make_exception_future(_error); } } else { + clear_stale_ssl_errors("SSL_read_ex"); buf.trim(bytes_read); return make_ready_future(std::move(buf)); } @@ -1495,8 +1498,10 @@ class session : public enable_shared_from_this, public session_impl { auto res = SSL_shutdown(_ssl.get()); tls_log.trace("{} do_shutdown: SSL_shutdown: {}", *this, res); if (res == 1) { + clear_stale_ssl_errors("SSL_shutdown"); return wait_for_output(); } else if (res == 0) { + clear_stale_ssl_errors("SSL_shutdown"); return yield().then([this] { return do_shutdown(); }); } else { auto ssl_err = SSL_get_error(_ssl.get(), res); From e5f91e1ec63c02af0a5e28ead22e329dfe6d0836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gell=C3=A9rt=20Peresztegi-Nagy?= Date: Wed, 15 Apr 2026 10:53:06 +0100 Subject: [PATCH 5/5] ossl: assert clean error queue before SSL operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenSSL's SSL_get_error relies on the per-thread error queue to classify failures. In seastar's cooperative scheduling model, multiple TLS sessions share the same thread. If one session leaves stale errors on the queue and then yields, another session's SSL_get_error call may misclassify the error — e.g. reporting SSL_ERROR_SYSCALL instead of SSL_ERROR_SSL or vice versa — because SSL_get_error peeks at the oldest error on the queue to decide the classification (ssl_lib.c, ossl_ssl_get_error). We have seen error queue contamination cause issues in practice (e1625c8cc "net: avoid propagating system errors to errno", cd02ecc2b "ossl: Added ERR_clear_error if disconnected post write"). The current code is believed to be correct — all error paths drain the queue before scheduling points — but we add these checks to aid debugging if a regression is introduced in the future. Add verify_clean_error_queue() checks before every SSL_do_handshake, SSL_write_ex, SSL_read_ex and SSL_shutdown call. If stale errors are found, the first entry is logged at warn level and an assertion fires to surface the problem in tests. --- src/net/ossl.cc | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/net/ossl.cc b/src/net/ossl.cc index 92f54828858..c8e241938e7 100644 --- a/src/net/ossl.cc +++ b/src/net/ossl.cc @@ -1195,6 +1195,7 @@ class session : public enable_shared_from_this, public session_impl { // This do_until runs until either a renegotiation occurs or the packet is empty while (!eof() && size > 0) { size_t bytes_written = 0; + verify_clean_error_queue("SSL_write_ex"); auto write_rc = SSL_write_ex(_ssl.get(), ptr, size, &bytes_written); tls_log.trace("{} do_put: SSL_write_ex: {}", *this, write_rc); if (write_rc != 1) { @@ -1287,6 +1288,7 @@ class session : public enable_shared_from_this, public session_impl { [this] { return connected() || eof(); }, [this] { try { + verify_clean_error_queue("SSL_do_handshake"); auto n = SSL_do_handshake(_ssl.get()); tls_log.trace("{} do_handshake: SSL_do_handshake: {}", *this, n); if (n <= 0) { @@ -1399,6 +1401,7 @@ class session : public enable_shared_from_this, public session_impl { tls_log.trace("{} do_get: available: {}", *this, avail); buf_type buf(avail); size_t bytes_read = 0; + verify_clean_error_queue("SSL_read_ex"); auto read_result = SSL_read_ex( _ssl.get(), buf.get_write(), avail, &bytes_read); tls_log.trace("{} do_get: SSL_read_ex: {}", *this, read_result); @@ -1495,6 +1498,7 @@ class session : public enable_shared_from_this, public session_impl { return make_ready_future(); } + verify_clean_error_queue("SSL_shutdown"); auto res = SSL_shutdown(_ssl.get()); tls_log.trace("{} do_shutdown: SSL_shutdown: {}", *this, res); if (res == 1) { @@ -1841,6 +1845,22 @@ SEASTAR_INTERNAL_END_IGNORE_DEPRECATIONS tls_log.debug("{} {}: ignoring stale errors on queue: {}", *this, operation, errors); } + // Checks that the OpenSSL per-thread error queue is clean before + // calling an SSL function. A dirty queue can cause SSL_get_error + // to misclassify results (e.g. reporting SSL_ERROR_SYSCALL instead + // of SSL_ERROR_SSL), which can affect unrelated sessions that share + // the same thread. + void verify_clean_error_queue(const char* operation) { + auto err = ERR_peek_error(); + if (err == 0) [[likely]] { + return; + } + char buf[256]; + ERR_error_string_n(err, buf, sizeof(buf)); + tls_log.warn("{} stale error on queue before {}: {}", *this, operation, buf); + SEASTAR_ASSERT(0 && "stale errors on OpenSSL error queue"); + } + std::vector do_get_alt_name_information(const x509_ptr &peer_cert, const std::unordered_set &types) const { int ext_idx = X509_get_ext_by_NID(