From 9f4fd7eebc5400a3248752145e9c8307748429fa Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:08 +1030 Subject: [PATCH 01/22] pytest: don't run tests marked slow_test at all if VALGRIND and SLOW_MACHINE. We used to just run these without valgrind, but we already run them in CI (which sets SLOW_MACHINE) without valgrind, so this just doubles up. Signed-off-by: Rusty Russell --- contrib/pyln-testing/pyln/testing/utils.py | 14 +++++--------- tests/conftest.py | 4 +++- tests/test_closing.py | 4 ++-- tests/test_connection.py | 16 ++++++++-------- tests/test_plugin.py | 2 +- tests/test_reckless.py | 4 ++-- 6 files changed, 21 insertions(+), 23 deletions(-) diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py index 8607cd9bdf1d..732bd7b49012 100644 --- a/contrib/pyln-testing/pyln/testing/utils.py +++ b/contrib/pyln-testing/pyln/testing/utils.py @@ -841,7 +841,7 @@ def call(self, method, payload=None, cmdprefix=None, filter=None): class LightningNode(object): - def __init__(self, node_id, lightning_dir, bitcoind, executor, valgrind, may_fail=False, + def __init__(self, node_id, lightning_dir, bitcoind, executor, may_fail=False, may_reconnect=False, broken_log=None, allow_warning=False, @@ -900,7 +900,7 @@ def __init__(self, node_id, lightning_dir, bitcoind, executor, valgrind, may_fai self.daemon.opts["dev-debugger"] = dbgvar if os.getenv("DEBUG_LIGHTNINGD"): self.daemon.opts["dev-debug-self"] = None - if valgrind: + if VALGRIND: self.daemon.env["LIGHTNINGD_DEV_NO_BACKTRACE"] = "1" self.daemon.opts["dev-no-plugin-checksum"] = None else: @@ -926,7 +926,7 @@ def __init__(self, node_id, lightning_dir, bitcoind, executor, valgrind, may_fai dsn = db.get_dsn() if dsn is not None: self.daemon.opts['wallet'] = dsn - if valgrind: + if VALGRIND: trace_skip_pattern = '*python*,*bitcoin-cli*,*elements-cli*,*cln-grpc*,*clnrest*,*wss-proxy*,*cln-bip353*,*reckless' if not valgrind_plugins: trace_skip_pattern += ',*plugins*' @@ -1653,10 +1653,6 @@ class NodeFactory(object): """ def __init__(self, request, testname, bitcoind, executor, directory, db_provider, node_cls, jsonschemas): - if request.node.get_closest_marker("slow_test") and SLOW_MACHINE: - self.valgrind = False - else: - self.valgrind = VALGRIND self.testname = testname # Set test name in environment for coverage file organization @@ -1755,7 +1751,7 @@ def get_node(self, node_id=None, options=None, dbfile=None, db = self.db_provider.get_db(os.path.join(lightning_dir, TEST_NETWORK), self.testname, node_id) db.provider = self.db_provider node = self.node_cls( - node_id, lightning_dir, self.bitcoind, self.executor, self.valgrind, db=db, + node_id, lightning_dir, self.bitcoind, self.executor, db=db, port=port, grpc_port=grpc_port, options=options, may_fail=may_fail or expect_fail, jsonschemas=self.jsonschemas, **kwargs @@ -1872,7 +1868,7 @@ def killall(self, expected_successes): # leak detection upsets VALGRIND by reading uninitialized mem, # and valgrind adds extra fds. # If it's dead, we'll catch it below. - if not self.valgrind: + if not VALGRIND: try: # This also puts leaks in log. 
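                # dev_memleak has lightningd walk its tal allocation tree and
                # report anything unaccounted for.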
leaks = self.nodes[i].rpc.dev_memleak()['leaks'] diff --git a/tests/conftest.py b/tests/conftest.py index 029050742e06..dcbd43cabb4a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,6 @@ import pytest -from pyln.testing.utils import EXPERIMENTAL_DUAL_FUND +from pyln.testing.utils import EXPERIMENTAL_DUAL_FUND, VALGRIND, SLOW_MACHINE # This function is based upon the example of how to @@ -37,3 +37,5 @@ def pytest_runtest_setup(item): else: # If there's no openchannel marker, skip if EXP_DF if EXPERIMENTAL_DUAL_FUND: pytest.skip('v1-only test, EXPERIMENTAL_DUAL_FUND=1') + if "slow_test" in item.keywords and VALGRIND and SLOW_MACHINE: + pytest.skip("Skipping slow tests under VALGRIND") diff --git a/tests/test_closing.py b/tests/test_closing.py index 3a7c31642473..f3c7dbee06cc 100644 --- a/tests/test_closing.py +++ b/tests/test_closing.py @@ -1,7 +1,7 @@ from fixtures import * # noqa: F401,F403 from pyln.client import RpcError, Millisatoshi from shutil import copyfile -from pyln.testing.utils import SLOW_MACHINE +from pyln.testing.utils import SLOW_MACHINE, VALGRIND from utils import ( only_one, sync_blockheight, wait_for, TIMEOUT, account_balance, first_channel_id, closing_fee, TEST_NETWORK, @@ -3232,7 +3232,7 @@ def check_billboard(): def test_shutdown(node_factory): # Fail, in that it will exit before cleanup. l1 = node_factory.get_node(may_fail=True) - if not node_factory.valgrind: + if not VALGRIND: leaks = l1.rpc.dev_memleak()['leaks'] if len(leaks): raise Exception("Node {} has memory leaks: {}" diff --git a/tests/test_connection.py b/tests/test_connection.py index 05724c517ec8..78261f7119a7 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -13,7 +13,7 @@ mine_funding_to_announce, first_scid, CHANNEL_SIZE ) -from pyln.testing.utils import SLOW_MACHINE, VALGRIND, EXPERIMENTAL_DUAL_FUND, FUNDAMOUNT, RUST +from pyln.testing.utils import VALGRIND, EXPERIMENTAL_DUAL_FUND, FUNDAMOUNT, RUST import os import pytest @@ -1463,7 +1463,7 @@ def test_funding_v2_corners(node_factory, bitcoind): assert l1.rpc.openchannel_update(start['channel_id'], start['psbt'])['commitments_secured'] -@unittest.skipIf(SLOW_MACHINE and not VALGRIND, "Way too taxing on CI machines") +@pytest.mark.slow_test @pytest.mark.openchannel('v1') def test_funding_cancel_race(node_factory, bitcoind, executor): l1 = node_factory.get_node() @@ -1474,7 +1474,7 @@ def test_funding_cancel_race(node_factory, bitcoind, executor): bitcoind.generate_block(1) wait_for(lambda: len(l1.rpc.listfunds()["outputs"]) != 0) - if node_factory.valgrind: + if VALGRIND: num = 5 else: num = 100 @@ -1536,7 +1536,7 @@ def test_funding_cancel_race(node_factory, bitcoind, executor): assert num_cancel == len(nodes) # We should have raced at least once! 
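    # (Under valgrind we only run 5 iterations and everything is slow, so one
    # side can always win; only assert we actually raced on normal runs.)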
- if not node_factory.valgrind: + if not VALGRIND: assert num_cancel > 0 assert num_complete > 0 @@ -1544,7 +1544,7 @@ def test_funding_cancel_race(node_factory, bitcoind, executor): executor.map(lambda n: n.stop(), node_factory.nodes) -@unittest.skipIf(SLOW_MACHINE and not VALGRIND, "Way too taxing on CI machines") +@pytest.mark.slow_test @pytest.mark.openchannel('v2') def test_funding_v2_cancel_race(node_factory, bitcoind, executor): l1 = node_factory.get_node() @@ -1555,7 +1555,7 @@ def test_funding_v2_cancel_race(node_factory, bitcoind, executor): bitcoind.generate_block(1) wait_for(lambda: len(l1.rpc.listfunds()["outputs"]) != 0) - if node_factory.valgrind: + if VALGRIND: num = 5 else: num = 100 @@ -1610,7 +1610,7 @@ def test_funding_v2_cancel_race(node_factory, bitcoind, executor): print("Cancelled {} complete {}".format(num_cancel, num_complete)) # We should have raced at least once! - if not node_factory.valgrind: + if not VALGRIND: assert num_cancel > 0 assert num_complete > 0 @@ -3452,7 +3452,7 @@ def test_feerate_stress(node_factory, executor): @pytest.mark.slow_test def test_pay_disconnect_stress(node_factory, executor): """Expose race in htlc restoration in channeld: 50% chance of failure""" - if node_factory.valgrind: + if VALGRIND: NUM_RUNS = 2 else: NUM_RUNS = 5 diff --git a/tests/test_plugin.py b/tests/test_plugin.py index 475460291eb9..1fce11e0534d 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -2983,7 +2983,7 @@ def test_autoclean(node_factory): # Under valgrind in CI, it can 50 seconds between creating invoice # and restarting. - if node_factory.valgrind: + if VALGRIND: short_timeout = 10 longer_timeout = 60 else: diff --git a/tests/test_reckless.py b/tests/test_reckless.py index 7fb04d742726..275960d576fb 100644 --- a/tests/test_reckless.py +++ b/tests/test_reckless.py @@ -2,7 +2,7 @@ import subprocess from pathlib import PosixPath, Path import socket -from pyln.testing.utils import VALGRIND, SLOW_MACHINE +from pyln.testing.utils import VALGRIND import pytest import os import re @@ -352,7 +352,7 @@ def test_tag_install(node_factory): @pytest.mark.flaky(reruns=5) -@unittest.skipIf(VALGRIND and SLOW_MACHINE, "node too slow for starting plugin under valgrind") +@pytest.mark.slow_test def test_reckless_uv_install(node_factory): node = get_reckless_node(node_factory) node.start() From b76e4408868751894b0baab48d30f694b4b26495 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:11 +1030 Subject: [PATCH 02/22] CI: remove reruns on all failures. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is covering up real bugs; we need to fix CI instead. 
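To make the failure mode concrete (a hypothetical sketch, not a test from
this tree): with ten reruns, a race that bites 30% of the time only fails
CI if all eleven attempts lose, i.e. 0.3^11, or about 0.0002% of runs:

```
import random


def test_racy():
    # Stand-in for a real race: ~30% of runs observe the broken state,
    # but pytest-rerunfailures keeps retrying until one run gets lucky.
    assert random.random() > 0.3
```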
From https://pypi.org/project/pytest-rerunfailures/#re-run-all-failures: ``` To re-run all test failures, use the --reruns command line option with the maximum number of times you’d like the tests to run: $ pytest --reruns 5 ``` Signed-off-by: Rusty Russell --- .github/workflows/ci.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 535afd2b8e92..51a02bf2e202 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -16,7 +16,7 @@ env: RUST_PROFILE: release SLOW_MACHINE: 1 CI_SERVER_URL: "http://35.239.136.52:3170" - GLOBAL_PYTEST_OPTS: "--reruns=10 -vvv" + GLOBAL_PYTEST_OPTS: "-vvv" jobs: prebuild: @@ -431,7 +431,7 @@ jobs: env: RUST_PROFILE: release # Has to match the one in the compile step CFG: compile-gcc - PYTEST_OPTS: --test-group-random-seed=42 --timeout=1800 --durations=10 --reruns=10 + PYTEST_OPTS: --test-group-random-seed=42 --timeout=1800 --durations=10 needs: - compile strategy: @@ -502,7 +502,7 @@ jobs: RUST_PROFILE: release SLOW_MACHINE: 1 TEST_DEBUG: 1 - PYTEST_OPTS: --test-group-random-seed=42 --timeout=1800 --durations=10 --reruns=10 + PYTEST_OPTS: --test-group-random-seed=42 --timeout=1800 --durations=10 needs: - compile strategy: From cccf312f1d5c3b347b73bc1454dc7f7338bcc5e4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:11 +1030 Subject: [PATCH 03/22] connectd: fix race when we supply a new address. This shows up as a flake in test_route_by_old_scid: ``` # Now restart l2, make sure it remembers the original! l2.restart() > l2.rpc.connect(l1.info['id'], 'localhost', l1.port) tests/test_splicing.py:554: ... > raise RpcError(method, payload, resp['error']) E pyln.client.lightning.RpcError: RPC call failed: method: connect, payload: {'id': '0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518', 'host': 'localhost', 'port': 33837}, error: {'code': 400, 'message': 'Unable to connect, no address known for peer'} ``` This is because it's already (auto)connecting, and fails. 
This failure is reported before we've added the new address (once we add
the new address, we connect fine, but it's too late!):

```
lightningd-2 2025-12-08T02:39:18.241Z DEBUG gossipd: REPLY WIRE_GOSSIPD_NEW_BLOCKHEIGHT_REPLY with 0 fds
lightningd-2 2025-12-08T02:39:18.320Z DEBUG 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-connectd: Initializing important peer with 0 addresses
lightningd-2 2025-12-08T02:39:18.320Z DEBUG 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-connectd: Failed connected out: Unable to connect, no address known for peer
lightningd-2 2025-12-08T02:39:18.344Z DEBUG 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-connectd: Will try reconnect in 1 seconds
lightningd-2 2025-12-08T02:39:18.344Z DEBUG 035d2b1192dfba134e10e540875d366ebc8bc353d5aa766b80c090b39c3a5d885d-connectd: Initializing important peer with 1 addresses
lightningd-2 2025-12-08T02:39:18.344Z DEBUG 035d2b1192dfba134e10e540875d366ebc8bc353d5aa766b80c090b39c3a5d885d-connectd: Connected out, starting crypto
lightningd-2 2025-12-08T02:39:18.344Z DEBUG 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-connectd: Adding 1 addresses to important peer
lightningd-2 2025-12-08T02:39:18.345Z DEBUG 0266e4598d1d3c415f572a8488830b60f7e744ed9235eb0b1ba93283b315c03518-connectd: Connected out, starting crypto
{'run_id': 256236335046680576, 'github_repository': 'ElementsProject/lightning', 'github_sha': '555f1ac461d151064aad6fc83b94a0290e2e9d5d', 'github_ref': 'refs/pull/8767/merge', 'github_ref_name': 'HEAD', 'github_run_id': 20013689862, 'github_head_ref': 'fixup-backfill-bug', 'github_run_number': 14774, 'github_base_ref': 'master', 'github_run_attempt': '1', 'testname': 'test_route_by_old_scid', 'start_time': 1765161493, 'end_time': 1765161558, 'outcome': 'fail'}
lightningd-2 2025-12-08T02:39:18.421Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-hsmd: Got WIRE_HSMD_ECDH_REQ
lightningd-2 2025-12-08T02:39:18.421Z DEBUG hsmd: Client: Received message 1 from client
lightningd-2 2025-12-08T02:39:18.453Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-hsmd: Got WIRE_HSMD_ECDH_REQ
lightningd-2 2025-12-08T02:39:18.453Z DEBUG hsmd: Client: Received message 1 from client
--------------------------- Captured stdout teardown ---------------------------
```
---
 lightningd/connect_control.c | 19 +++++++++++++++----
 tests/test_splicing.py       |  1 -
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/lightningd/connect_control.c b/lightningd/connect_control.c
index e374ae961606..f2710db52b7a 100644
--- a/lightningd/connect_control.c
+++ b/lightningd/connect_control.c
@@ -277,10 +277,21 @@ static void connect_failed(struct lightningd *ld,
 			   connect_nsec,
 			   connect_attempted);
 
-	/* We can have multiple connect commands: fail them all */
-	while ((c = find_connect(ld, id)) != NULL) {
-		/* They delete themselves from list */
-		was_pending(command_fail(c->cmd, errcode, "%s", errmsg));
+	/* There's a race between autoreconnect and connect commands.  This
+	 * matters because the autoreconnect might have failed, but that was before
+	 * the connect_to_peer command gave connectd a new address.  Thus we wait for
+	 * one we explicitly asked for before failing.
+	 *
+	 * A similar pattern could occur with multiple connect commands, however connectd
+	 * does simply combine those, so we don't get a response per request, and it's a
+	 * very rare corner case (which, unlike the above, doesn't happen in CI!).
+ */ + if (strstarts(connect_reason, "connect command")) { + /* We can have multiple connect commands: fail them all */ + while ((c = find_connect(ld, id)) != NULL) { + /* They delete themselves from list */ + was_pending(command_fail(c->cmd, errcode, "%s", errmsg)); + } } } diff --git a/tests/test_splicing.py b/tests/test_splicing.py index 0c5b71ac614e..966c75f64e7b 100644 --- a/tests/test_splicing.py +++ b/tests/test_splicing.py @@ -499,7 +499,6 @@ def test_splice_stuck_htlc(node_factory, bitcoind, executor): assert l1.db_query("SELECT count(*) as c FROM channeltxs;")[0]['c'] == 0 -@pytest.mark.flaky(reruns=5) @unittest.skipIf(TEST_NETWORK != 'regtest', 'elementsd doesnt yet support PSBT features we need') def test_route_by_old_scid(node_factory, bitcoind): l1, l2, l3 = node_factory.line_graph(3, wait_for_announce=True, opts={'experimental-splicing': None, 'may_reconnect': True}) From 2ea7461a3bc7cb9bdb8346657cbf7cf1cf9a5e8e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:11 +1030 Subject: [PATCH 04/22] pytest: fix real reason for warning issue in test_route_by_old_scid. We can still get a warning: lightningd-1 2025-12-10T01:11:07.232Z DEBUG 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-connectd: Received WIRE_WARNING: WARNING: channel_announcement: no unspent txout 109x1x1 This has nothing to do with l1 talking about the original channel (which would be 103x1x): it's because l2's gossipd (being the node which does the splice) immediately forgets the pre-splice id. If l1 sends some gossip, it will get a warning message. Signed-off-by: Rusty Russell --- tests/test_splicing.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_splicing.py b/tests/test_splicing.py index 966c75f64e7b..e5a3565b1394 100644 --- a/tests/test_splicing.py +++ b/tests/test_splicing.py @@ -501,7 +501,13 @@ def test_splice_stuck_htlc(node_factory, bitcoind, executor): @unittest.skipIf(TEST_NETWORK != 'regtest', 'elementsd doesnt yet support PSBT features we need') def test_route_by_old_scid(node_factory, bitcoind): - l1, l2, l3 = node_factory.line_graph(3, wait_for_announce=True, opts={'experimental-splicing': None, 'may_reconnect': True}) + opts = {'experimental-splicing': None, 'may_reconnect': True} + # l1 sometimes talks about pre-splice channels. l2 (being part of the splice) immediately forgets + # the old scid and uses the new one, then complains when l1 talks about it. Which is fine, but + # breaks CI. + l1opts = opts.copy() + l1opts['allow_warning'] = True + l1, l2, l3 = node_factory.line_graph(3, wait_for_announce=True, opts=[l1opts, opts, opts]) # Get pre-splice route. inv = l3.rpc.invoice(10000000, 'test_route_by_old_scid', 'test_route_by_old_scid') @@ -527,11 +533,6 @@ def test_route_by_old_scid(node_factory, bitcoind): l1.rpc.sendpay(route, inv['payment_hash'], payment_secret=inv['payment_secret']) l1.rpc.waitsendpay(inv['payment_hash']) - # Make sure l1 has seen and processed announcement for new splice - # scid, otherwise we can get gossip warning here (which breaks CI) if we splice again. - scid = only_one(l3.rpc.listchannels(source=l3.info['id'])['channels'])['short_channel_id'] - wait_for(lambda: l1.rpc.listchannels(short_channel_id=scid)['channels'] != []) - # Let's splice again, so the original scid is two behind the times. 
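    # (After this second splice, l2 forgets the old scid again, so l1 may
    # receive another warning; that's covered by l1's allow_warning above.)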
l3.fundwallet(200000) funds_result = l3.rpc.fundpsbt("109000sat", "slow", 166, excess_as_change=True) From 112e7ef62954b58f8a5c9ddce2d01b6699f27df2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:11 +1030 Subject: [PATCH 05/22] pytest: remove test_lockup_drain. Since this was written, we now test if remote side would get into this situation and stop it from happening, so the test doesn't work any more. Signed-off-by: Rusty Russell --- tests/test_pay.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/tests/test_pay.py b/tests/test_pay.py index 442e35d73756..277c19162913 100644 --- a/tests/test_pay.py +++ b/tests/test_pay.py @@ -2810,31 +2810,6 @@ def test_channel_spendable_receivable_capped(node_factory, bitcoind): assert l2.rpc.listpeerchannels()['channels'][0]['receivable_msat'] == Millisatoshi(0xFFFFFFFF) -@unittest.skipIf(True, "Test is extremely flaky") -def test_lockup_drain(node_factory, bitcoind): - """Try to get channel into a state where opener can't afford fees on additional HTLC, so peer can't add HTLC""" - l1, l2 = node_factory.line_graph(2, opts={'may_reconnect': True}) - - # l1 sends all the money to l2 until even 1 msat can't get through. - total = l1.drain(l2) - - # Even if feerate now increases 2x (30000), l2 should be able to send - # non-dust HTLC to l1. - l1.force_feerates(30000) - l2.pay(l1, total // 2) - - # reset fees and send all back again - l1.force_feerates(15000) - l1.drain(l2) - - # But if feerate increase just a little more, l2 should not be able to send - # non-fust HTLC to l1 - l1.force_feerates(30002) # TODO: Why does 30001 fail? off by one in C code? - wait_for(lambda: l1.rpc.listpeers()['peers'][0]['connected']) - with pytest.raises(RpcError, match=r".*Capacity exceeded.*"): - l2.pay(l1, total // 2) - - @unittest.skipIf(TEST_NETWORK != 'regtest', 'Assumes anchors') def test_htlc_too_dusty_outgoing(node_factory, bitcoind, chainparams): """ Try to hit the 'too much dust' limit, should fail the HTLC """ From e33ab731c294800c2d0d79019c70b8f2c13cd886 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:12 +1030 Subject: [PATCH 06/22] pytest: restore and fix disabled test test_excluded_adjacent_routehint. 1. It was flaky, probably because it didn't wait for the remote update_channel. 2. Rusty applied a fix in 5f664dac77d, not clear if it worked. 3. Christian disabled it altogether in 23ce9a947df. Signed-off-by: Rusty Russell --- lightningd/connect_control.c | 3 ++- tests/test_pay.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lightningd/connect_control.c b/lightningd/connect_control.c index f2710db52b7a..611b5ba265fe 100644 --- a/lightningd/connect_control.c +++ b/lightningd/connect_control.c @@ -286,7 +286,8 @@ static void connect_failed(struct lightningd *ld, * does simply combine those, so we don't get a response per request, and it's a * very rare corner case (which, unlike the above, doesn't happen in CI!). 
*/ - if (strstarts(connect_reason, "connect command")) { + if (strstarts(connect_reason, "connect command") + || errcode == CONNECT_DISCONNECTED_DURING) { /* We can have multiple connect commands: fail them all */ while ((c = find_connect(ld, id)) != NULL) { /* They delete themselves from list */ diff --git a/tests/test_pay.py b/tests/test_pay.py index 277c19162913..41bdc3b601fa 100644 --- a/tests/test_pay.py +++ b/tests/test_pay.py @@ -3469,7 +3469,6 @@ def test_reject_invalid_payload(node_factory): l2.daemon.wait_for_log(r'Failing HTLC because of an invalid payload') -@unittest.skip("Test is flaky causing CI to be unusable.") def test_excluded_adjacent_routehint(node_factory, bitcoind): """Test case where we try have a routehint which leads to an adjacent node, but the result exceeds our maxfee; we crashed trying to find @@ -3478,10 +3477,15 @@ def test_excluded_adjacent_routehint(node_factory, bitcoind): """ l1, l2, l3 = node_factory.line_graph(3) + # Make sure l2->l3 is usable. + wait_for(lambda: 'remote' in only_one(l3.rpc.listpeerchannels()['channels'])['updates']) + # We'll be forced to use routehint, since we don't know about l3. inv = l3.rpc.invoice(10**3, "lbl", "desc", exposeprivatechannels=l2.get_channel_scid(l3)) - l1.wait_channel_active(l1.get_channel_scid(l2)) + # Make sure l1->l2 is usable. + wait_for(lambda: 'remote' in only_one(l1.rpc.listpeerchannels()['channels'])['updates']) + # This will make it reject the routehint. err = r'Fee exceeds our fee budget: 1msat > 0msat, discarding route' with pytest.raises(RpcError, match=err): From afe38169ed38ccdbb2cb3c85659da797f6f7bf21 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:12 +1030 Subject: [PATCH 07/22] pytest: expect slow commands with giant commando test ``` 2025-12-10T02:51:06.2435409Z [gw1] [ 77%] ERROR tests/test_plugin.py::test_commando ...lightningd-1: had BROKEN messages ... 2025-12-10T03:00:26.0440311Z lightningd-1 2025-12-10T02:51:01.548Z UNUSUAL jsonrpc#69: That's weird: Request checkrune took 5961 milliseconds ``` Signed-off-by: Rusty Russell --- tests/test_plugin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_plugin.py b/tests/test_plugin.py index 1fce11e0534d..fd644a56b999 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -2845,7 +2845,8 @@ def test_plugin_shutdown(node_factory): def test_commando(node_factory, executor): l1, l2 = node_factory.line_graph(2, fundchannel=False, - opts={'log-level': 'io'}) + # Under valgrind, checkrune of 400k command can be slow! + opts={'log-level': 'io', 'broken_log': "That's weird: Request .* took"}) rune = l1.rpc.createrune()['rune'] From f985ac45429bcaa0504d00a8185173e96c91c620 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:12 +1030 Subject: [PATCH 08/22] pytest: fix test_bitcoin_backend_gianttx flake. signpsbt could be the one which takes a long time, so allow any psbt event. Signed-off-by: Rusty Russell --- tests/test_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_plugin.py b/tests/test_plugin.py index fd644a56b999..c589d80c2c05 100644 --- a/tests/test_plugin.py +++ b/tests/test_plugin.py @@ -1904,7 +1904,7 @@ def test_bitcoin_backend(node_factory, bitcoind): def test_bitcoin_backend_gianttx(node_factory, bitcoind): """Test that a giant tx doesn't crash bcli""" # This complains about how long fundpsbt took. 
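    # (It can equally be signpsbt that is slow, hence matching any psbt request.)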
-    l1 = node_factory.get_node(start=False, broken_log='Request fundpsbt took')
+    l1 = node_factory.get_node(start=False, broken_log="That's weird: Request .*psbt took")
     # With memleak we spend far too much time gathering backtraces.
     if "LIGHTNINGD_DEV_MEMLEAK" in l1.daemon.env:
         del l1.daemon.env["LIGHTNINGD_DEV_MEMLEAK"]

From 97ca41368c22663dfd8ce1bb7ec339c90c45659d Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Thu, 11 Dec 2025 17:54:12 +1030
Subject: [PATCH 09/22] pytest: note that we also trigger CI failure on these "That's weird" messages.

Signed-off-by: Rusty Russell
---
 contrib/pyln-testing/pyln/testing/fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/pyln-testing/pyln/testing/fixtures.py b/contrib/pyln-testing/pyln/testing/fixtures.py
index db4206a1279c..ef55ecd0aa65 100644
--- a/contrib/pyln-testing/pyln/testing/fixtures.py
+++ b/contrib/pyln-testing/pyln/testing/fixtures.py
@@ -498,7 +498,7 @@ def map_node_error(nodes, f, msg):
 
     map_node_error(nf.nodes, printValgrindErrors, "reported valgrind errors")
     map_node_error(nf.nodes, printCrashLog, "had crash.log files")
-    map_node_error(nf.nodes, checkBroken, "had BROKEN messages")
+    map_node_error(nf.nodes, checkBroken, "had BROKEN or That's weird messages")
     map_node_error(nf.nodes, lambda n: not n.allow_warning and n.daemon.is_in_log(r' WARNING:'), "had warning messages")
     map_node_error(nf.nodes, checkReconnect, "had unexpected reconnections")
     map_node_error(nf.nodes, checkPluginJSON, "had malformed hooks/notifications")

From 05beca1a9a94b4832e735078db92c886a26d16e4 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Thu, 11 Dec 2025 17:54:13 +1030
Subject: [PATCH 10/22] pytest: fix flake in test_coin_movement_notices

When we restart the node, if the coin_movements.py plugin hasn't processed
the notification yet, its balance will be incorrect:

```
>       assert account_balance(l2, chanid_1) == 100001001
E       AssertionError: assert 150_001_001msat == 100_001_001
E        +  where 150001001msat = account_balance(, '39ac52c818c5304cf0664940ff236c4e3f8f4ceb8993cb1491347142d61b62bc')
```

Signed-off-by: Rusty Russell
---
 tests/test_plugin.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_plugin.py b/tests/test_plugin.py
index c589d80c2c05..32fb94506d2a 100644
--- a/tests/test_plugin.py
+++ b/tests/test_plugin.py
@@ -2191,7 +2191,6 @@ def test_plugin_fail(node_factory):
     l1.daemon.wait_for_log(r': exited during normal operation')
 
 
-@pytest.mark.flaky(reruns=5)
 @pytest.mark.openchannel('v1')
 @pytest.mark.openchannel('v2')
 def test_coin_movement_notices(node_factory, bitcoind, chainparams):
@@ -2269,6 +2268,9 @@ def test_coin_movement_notices(node_factory, bitcoind, chainparams):
     l2.rpc.sendpay(route, payment_hash21, payment_secret=inv['payment_secret'])
     l2.rpc.waitsendpay(payment_hash21)
 
+    # Make sure coin_movements.py sees the event before we restart!
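+    # (wait_for_log blocks until the line appears, raising TimeoutError if it never does.)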
+ l2.daemon.wait_for_log(f"plugin-coin_movements.py: coin movement: .*'payment_hash': '{payment_hash21}'") + # restart to test index l2.restart() wait_for(lambda: all(c['state'] == 'CHANNELD_NORMAL' for c in l2.rpc.listpeerchannels()["channels"])) From 3c3b86af72c275387185f27f4e7528ab057d6028 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:13 +1030 Subject: [PATCH 11/22] patch enable-offline-test.patch --- tests/test_connection.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/test_connection.py b/tests/test_connection.py index 78261f7119a7..92a2a12f9487 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -4523,15 +4523,24 @@ def test_reconnect_no_additional_transient_failure(node_factory, bitcoind): assert not l1.daemon.is_in_log(f"{l2id}-chan#1: Peer transient failure in CHANNELD_NORMAL: Disconnected", start=offset1) -@pytest.mark.xfail(strict=True) def test_offline(node_factory): # if get_node starts it, it'll expect an address, so do it manually. l1 = node_factory.get_node(options={"offline": None}, start=False) l1.daemon.start() - # we expect it to log offline mode an not to create any listener + # we expect it to log offline mode not to listen. assert l1.daemon.is_in_log("Started in offline mode!") - assert not l1.daemon.is_in_log("connectd: Created listener on") + line = l1.daemon.is_in_log("connectd: Created listener on") + port = re.search(r'connectd: Created listener on 127.0.0.1:(.*)', line).groups()[0] + + l2 = node_factory.get_node() + + # We cannot connect in! + with pytest.raises(RpcError, match=f"All addresses failed: 127.0.0.1:{port}: Connection establishment: Connection refused."): + l2.rpc.connect(l1.rpc.getinfo()['id'], 'localhost', int(port)) + + # We *can* connect out + l1.rpc.connect(l2.info['id'], 'localhost', l2.port) def test_last_stable_connection(node_factory): From 6a00216adbadc758fd33af9bee30258d82d34a1f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:13 +1030 Subject: [PATCH 12/22] pytest: fix feerate check in test_peer_anchor_push We didn't log when anchor transactions had short signatures, which causes this test to not assert (did_short_sig): ``` total_feerate_perkw = total_fees / total_weight * 1000 > check_feerate([l3, l2], total_feerate_perkw, feerate) tests/test_closing.py:4063: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ nodes = [, ] actual_feerate = 14006.105538595726, expected_feerate = 14000 def check_feerate(nodes, actual_feerate, expected_feerate): # Feerate can't be lower. 
        assert actual_feerate > expected_feerate - 2
 
        if actual_feerate >= expected_feerate + 2:
            if any([did_short_sig(n) for n in nodes]):
                return
 
        # Use assert as it shows the actual values on failure
>       assert actual_feerate < expected_feerate + 2
E       AssertionError
```

Signed-off-by: Rusty Russell
---
 hsmd/libhsmd.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/hsmd/libhsmd.c b/hsmd/libhsmd.c
index e4a14e44cf14..dd2329236abb 100644
--- a/hsmd/libhsmd.c
+++ b/hsmd/libhsmd.c
@@ -1820,6 +1820,13 @@ static u8 *handle_sign_anchorspend(struct hsmd_client *c, const u8 *msg_in)
 			      fmt_pubkey(tmpctx, &local_funding_pubkey),
 			      fmt_wally_psbt(tmpctx, psbt));
 	}
+	if (dev_warn_on_overgrind
+	    && psbt->inputs[0].signatures.num_items == 1
+	    && psbt->inputs[0].signatures.items[0].value_len < 71) {
+		hsmd_status_fmt(LOG_BROKEN, NULL,
+				"overgrind: short signature length %zu",
+				psbt->inputs[0].signatures.items[0].value_len);
+	}
 
 	return towire_hsmd_sign_anchorspend_reply(NULL, psbt);
 }

From d3801c174ddd879e0fdeb91576059d9d3735e0d Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Thu, 11 Dec 2025 17:54:13 +1030
Subject: [PATCH 13/22] pytest: test that askrene doesn't use local dying channels.

We don't want it to think that it can use both pre-splice and post-splice
channels!

Signed-off-by: Rusty Russell
---
 tests/test_askrene.py | 46 ++++++++++++++++++++++++++++++++++++++++++-
 tests/utils.py        |  5 +++++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/tests/test_askrene.py b/tests/test_askrene.py
index bf87287e9b05..613cfa1b990f 100644
--- a/tests/test_askrene.py
+++ b/tests/test_askrene.py
@@ -3,7 +3,7 @@
 from pyln.client import RpcError
 from pyln.testing.utils import SLOW_MACHINE
 from utils import (
-    only_one, first_scid, GenChannel, generate_gossip_store,
+    only_one, first_scid, first_scidd, GenChannel, generate_gossip_store,
     sync_blockheight, wait_for, TEST_NETWORK, TIMEOUT,
     mine_funding_to_announce
 )
 import os
@@ -11,6 +11,7 @@
 import subprocess
 import time
 import tempfile
+import unittest
 
 
 def direction(src, dst):
@@ -1915,3 +1916,46 @@ def test_askrene_reserve_clash(node_factory, bitcoind):
                     layers=['layer2'],
                     maxfee_msat=1000,
                     final_cltv=5)
+
+
+@unittest.skipIf(TEST_NETWORK != 'regtest', 'elementsd doesnt yet support PSBT features we need')
+@pytest.mark.openchannel('v1')
+@pytest.mark.openchannel('v2')
+def test_splice_dying_channel(node_factory, bitcoind):
+    """We should NOT try to use the pre-splice channel here"""
+    l1, l2, l3 = node_factory.line_graph(3,
+                                         wait_for_announce=True,
+                                         fundamount=200000,
+                                         opts={'experimental-splicing': None})
+
+    chan_id = l1.get_channel_id(l2)
+    funds_result = l1.rpc.addpsbtoutput(100000)
+    pre_splice_scidd = first_scidd(l1, l2)
+
+    # Pay the fee by subtracting 5000 from the channel balance
+    result = l1.rpc.splice_init(chan_id, -105000, funds_result['psbt'])
+    result = l1.rpc.splice_update(chan_id, result['psbt'])
+    assert(result['commitments_secured'] is False)
+    result = l1.rpc.splice_update(chan_id, result['psbt'])
+    assert(result['commitments_secured'] is True)
+    result = l1.rpc.splice_signed(chan_id, result['psbt'])
+
+    mine_funding_to_announce(bitcoind,
+                             [l1, l2, l3],
+                             num_blocks=6, wait_for_mempool=1)
+
+    wait_for(lambda: only_one(l1.rpc.listpeerchannels()['channels'])['state'] == 'CHANNELD_NORMAL')
+    post_splice_scidd = first_scidd(l1, l2)
+
+    # You will use the new scid
+    route = only_one(l1.rpc.getroutes(l1.info['id'], l2.info['id'], '50000sat', ['auto.localchans'], 100000, 6)['routes'])
+    assert only_one(route['path'])['short_channel_id_dir'] ==
post_splice_scidd + + # And you will not be able to route 100001 sats: + with pytest.raises(RpcError, match="We could not find a usable set of paths"): + l1.rpc.getroutes(l1.info['id'], l2.info['id'], '100001sat', ['auto.localchans'], 100000, 6) + + # But l3 would think it can use both, since it doesn't eliminate dying channel! + wait_for(lambda: [c['active'] for c in l3.rpc.listchannels()['channels']] == [True] * 6) + routes = l3.rpc.getroutes(l1.info['id'], l2.info['id'], '200001sat', [], 100000, 6)['routes'] + assert set([only_one(r['path'])['short_channel_id_dir'] for r in routes]) == set([pre_splice_scidd, post_splice_scidd]) diff --git a/tests/utils.py b/tests/utils.py index fc3f18b18da1..d879bf3db092 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -458,6 +458,11 @@ def first_scid(n1, n2): return only_one(n1.rpc.listpeerchannels(n2.info['id'])['channels'])['short_channel_id'] +def first_scidd(n1, n2): + c = only_one(n1.rpc.listpeerchannels(n2.info['id'])['channels']) + return c['short_channel_id'] + '/' + str(c['direction']) + + def basic_fee(feerate, anchor_expected): if anchor_expected: # option_anchor_outputs / option_anchors_zero_fee_htlc_tx From 09ded2838670e8b06e6b22d2b16c02d6f04a0d5c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:14 +1030 Subject: [PATCH 14/22] gossipd: don't shortcut dying phase for local channels. This means that we won't complain to peers which gossip about our channels, but it does mean that our channel graph (like other nodes on the network) will show two channels, not one, for the duration. For this reason, we need askrene to omit local dying channels. Signed-off-by: Rusty Russell --- gossipd/gossmap_manage.c | 9 --------- plugins/askrene/askrene.c | 25 +++++++++++++++++++++++++ tests/test_closing.py | 4 ++-- tests/test_misc.py | 4 ++-- 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/gossipd/gossmap_manage.c b/gossipd/gossmap_manage.c index 3227465da5ae..fec0bd9fde6f 100644 --- a/gossipd/gossmap_manage.c +++ b/gossipd/gossmap_manage.c @@ -1317,7 +1317,6 @@ void gossmap_manage_channel_spent(struct gossmap_manage *gm, struct short_channel_id scid) { struct gossmap_chan *chan; - const struct gossmap_node *me; const u8 *msg; struct chan_dying cd; struct gossmap *gossmap = gossmap_manage_get_gossmap(gm); @@ -1326,14 +1325,6 @@ void gossmap_manage_channel_spent(struct gossmap_manage *gm, if (!chan) return; - me = gossmap_find_node(gossmap, &gm->daemon->id); - /* We delete our own channels immediately, since we have local knowledge */ - if (gossmap_nth_node(gossmap, chan, 0) == me - || gossmap_nth_node(gossmap, chan, 1) == me) { - kill_spent_channel(gm, gossmap, scid); - return; - } - /* Is it already dying? It's lightningd re-telling us */ if (channel_already_dying(gm->dying_channels, scid)) return; diff --git a/plugins/askrene/askrene.c b/plugins/askrene/askrene.c index 23e541d75911..fe8f477fbd85 100644 --- a/plugins/askrene/askrene.c +++ b/plugins/askrene/askrene.c @@ -573,6 +573,7 @@ static struct command_result *do_getroutes(struct command *cmd, struct route **routes; struct flow **flows; struct json_stream *response; + const struct gossmap_node *me; /* update the gossmap */ if (gossmap_refresh(askrene->gossmap)) { @@ -593,6 +594,30 @@ static struct command_result *do_getroutes(struct command *cmd, rq->additional_costs = info->additional_costs; rq->maxparts = info->maxparts; + /* We also eliminate any local channels we *know* are dying. 
+ * Most channels get 12 blocks grace in case it's a splice, + * but if it's us, we know about the splice already. */ + me = gossmap_find_node(rq->gossmap, &askrene->my_id); + if (me) { + for (size_t i = 0; i < me->num_chans; i++) { + struct short_channel_id_dir scidd; + const struct gossmap_chan *c = gossmap_nth_chan(rq->gossmap, + me, i, NULL); + if (!gossmap_chan_is_dying(rq->gossmap, c)) + continue; + + scidd.scid = gossmap_chan_scid(rq->gossmap, c); + /* Disable both directions */ + for (scidd.dir = 0; scidd.dir < 2; scidd.dir++) { + bool enabled = false; + gossmap_local_updatechan(localmods, + &scidd, + &enabled, + NULL, NULL, NULL, NULL, NULL); + } + } + } + /* apply selected layers to the localmods */ apply_layers(askrene, rq, &info->source, info->amount, localmods, info->layers, info->local_layer); diff --git a/tests/test_closing.py b/tests/test_closing.py index f3c7dbee06cc..33abde9c8b6e 100644 --- a/tests/test_closing.py +++ b/tests/test_closing.py @@ -1851,8 +1851,8 @@ def test_onchaind_replay(node_factory, bitcoind): # Wait for nodes to notice the failure, this seach needle is after the # DB commit so we're sure the tx entries in onchaindtxs have been added - l1.daemon.wait_for_log("Deleting channel .* due to the funding outpoint being spent") - l2.daemon.wait_for_log("Deleting channel .* due to the funding outpoint being spent") + l1.daemon.wait_for_log("closing soon due to the funding outpoint being spent") + l2.daemon.wait_for_log("closing soon due to the funding outpoint being spent") # We should at least have the init tx now assert len(l1.db_query("SELECT * FROM channeltxs;")) > 0 diff --git a/tests/test_misc.py b/tests/test_misc.py index e17d4935fc9c..692b689814e1 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -1488,8 +1488,8 @@ def no_more_blocks(req): l1.rpc.close(l2.info['id']) bitcoind.generate_block(1, True) - l1.daemon.wait_for_log(r'Deleting channel') - l2.daemon.wait_for_log(r'Deleting channel') + l1.daemon.wait_for_log(r'closing soon due to the funding outpoint being spent') + l2.daemon.wait_for_log(r'closing soon due to the funding outpoint being spent') @pytest.mark.openchannel('v1') From e7311002b8b0fc1368a31373af93716c62e0050b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:14 +1030 Subject: [PATCH 15/22] pytest: remove channel upgrade tests. We removed the functionality, but only disabled the tests. Signed-off-by: Rusty Russell --- tests/test_connection.py | 272 --------------------------------------- 1 file changed, 272 deletions(-) diff --git a/tests/test_connection.py b/tests/test_connection.py index 92a2a12f9487..bd887416c24f 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -3729,278 +3729,6 @@ def test_openchannel_init_alternate(node_factory, executor): print("nothing to do") -@unittest.skip("experimental-upgrade-protocol TLV fields conflict with splicing TLV fields") -def test_upgrade_statickey(node_factory, executor): - """l1 doesn't have option_static_remotekey, l2 offers it.""" - l1, l2 = node_factory.get_nodes(2, opts=[{'may_reconnect': True, - 'experimental-upgrade-protocol': None, - # This forces us to allow sending non-static-remotekey! - 'dev-any-channel-type': None}, - {'may_reconnect': True, - # This forces us to accept non-static-remotekey! - 'dev-any-channel-type': None, - 'experimental-upgrade-protocol': None}]) - - l1.fundwallet(2000000) - l1.rpc.connect(l2.info['id'], 'localhost', port=l2.port) - l1.rpc.fundchannel(l2.info['id'], 'all', channel_type=[]) - - # Now reconnect. 
- l1.rpc.disconnect(l2.info['id'], force=True) - l1.rpc.connect(l2.info['id'], 'localhost', l2.port) - - l1.daemon.wait_for_logs([r"They sent current_channel_type \[\]", - r"They offered upgrade to \[12\]"]) - l2.daemon.wait_for_log(r"They sent desired_channel_type \[12\]") - - l1.daemon.wait_for_log('option_static_remotekey enabled at 1/1') - l2.daemon.wait_for_log('option_static_remotekey enabled at 1/1') - - # Make sure it's committed to db! - wait_for(lambda: l1.db_query('SELECT local_static_remotekey_start, remote_static_remotekey_start FROM channels;') == [{'local_static_remotekey_start': 1, 'remote_static_remotekey_start': 1}]) - - # They will consider themselves upgraded. - l1.rpc.disconnect(l2.info['id'], force=True) - # They won't offer upgrade! - assert not l1.daemon.is_in_log("They offered upgrade", - start=l1.daemon.logsearch_start) - l1.daemon.wait_for_log(r"They sent current_channel_type \[12\]") - l2.daemon.wait_for_log(r"They sent desired_channel_type \[12\]") - - -@unittest.skip("experimental-upgrade-protocol TLV fields conflict with splicing TLV fields") -def test_upgrade_statickey_onchaind(node_factory, executor, bitcoind): - """We test penalty before/after, and unilateral before/after""" - l1, l2 = node_factory.get_nodes(2, opts=[{'may_reconnect': True, - 'experimental-upgrade-protocol': None, - # This forces us to allow sending non-static-remotekey! - 'dev-any-channel-type': None, - # We try to cheat! - 'broken_log': r"onchaind-chan#[0-9]*: Could not find resolution for output .*: did \*we\* cheat\?"}, - {'may_reconnect': True, - # This forces us to allow non-static-remotekey! - 'dev-any-channel-type': None, - 'experimental-upgrade-protocol': None}]) - - l1.fundwallet(FUNDAMOUNT + 1000) - l1.rpc.connect(l2.info['id'], 'localhost', port=l2.port) - l1.rpc.fundchannel(l2.info['id'], 'all', channel_type=[]) - bitcoind.generate_block(1, wait_for_mempool=1) - wait_for(lambda: only_one(l1.rpc.listpeerchannels()['channels'])['state'] == 'CHANNELD_NORMAL') - - # TEST 1: Cheat from pre-upgrade. - tx = l1.rpc.dev_sign_last_tx(l2.info['id'])['tx'] - - l1.rpc.disconnect(l2.info['id'], force=True) - l1.rpc.connect(l2.info['id'], 'localhost', l2.port) - l1.daemon.wait_for_log('option_static_remotekey enabled at 1/1') - - # Make sure another commitment happens, sending failed payment. - routestep = { - 'amount_msat': 1, - 'id': l2.info['id'], - 'delay': 5, - 'channel': first_scid(l1, l2) - } - l1.rpc.sendpay([routestep], '00' * 32, payment_secret='00' * 32) - with pytest.raises(RpcError, match=r'WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS'): - l1.rpc.waitsendpay('00' * 32) - - # Make sure l2 gets REVOKE_AND_ACK from previous. - l2.daemon.wait_for_log('peer_in WIRE_UPDATE_ADD_HTLC') - l2.daemon.wait_for_log('peer_out WIRE_REVOKE_AND_ACK') - l2.daemon.wait_for_log('peer_in WIRE_REVOKE_AND_ACK') - - # Pre-statickey penalty works. - bitcoind.rpc.sendrawtransaction(tx) - bitcoind.generate_block(1) - - _, txid, blocks = l2.wait_for_onchaind_tx('OUR_PENALTY_TX', - 'THEIR_REVOKED_UNILATERAL/DELAYED_CHEAT_OUTPUT_TO_THEM') - assert blocks == 0 - - bitcoind.generate_block(100, wait_for_mempool=txid) - # This works even if they disconnect and listpeerchannels() is empty: - wait_for(lambda: l1.rpc.listpeerchannels()['channels'] == []) - wait_for(lambda: l2.rpc.listpeerchannels()['channels'] == []) - - # TEST 2: Cheat from post-upgrade. 
- l1.fundwallet(FUNDAMOUNT + 1000) - l1.rpc.connect(l2.info['id'], 'localhost', port=l2.port) - l1.rpc.fundchannel(l2.info['id'], 'all', channel_type=[]) - - l1.rpc.disconnect(l2.info['id'], force=True) - l1.rpc.connect(l2.info['id'], 'localhost', l2.port) - - l1.daemon.wait_for_log('option_static_remotekey enabled at 1/1') - l2.daemon.wait_for_log('option_static_remotekey enabled at 1/1') - bitcoind.generate_block(1, wait_for_mempool=1) - wait_for(lambda: only_one(l1.rpc.listpeerchannels()['channels'])['state'] == 'CHANNELD_NORMAL') - - l1.pay(l2, 1000000) - - # We will try to cheat later. - tx = l1.rpc.dev_sign_last_tx(l2.info['id'])['tx'] - - l1.pay(l2, 1000000) - - # Pre-statickey penalty works. - bitcoind.rpc.sendrawtransaction(tx) - bitcoind.generate_block(1) - - _, txid, blocks = l2.wait_for_onchaind_tx('OUR_PENALTY_TX', - 'THEIR_REVOKED_UNILATERAL/DELAYED_CHEAT_OUTPUT_TO_THEM') - assert blocks == 0 - - bitcoind.generate_block(100, wait_for_mempool=txid) - # This works even if they disconnect and listpeers() is empty: - wait_for(lambda: len(l1.rpc.listpeerchannels()['channels']) == 0) - wait_for(lambda: len(l2.rpc.listpeerchannels()['channels']) == 0) - - # TEST 3: Unilateral close from pre-upgrade - l1.rpc.connect(l2.info['id'], 'localhost', port=l2.port) - l1.fundwallet(FUNDAMOUNT + 1000) - l1.rpc.fundchannel(l2.info['id'], 'all', channel_type=[]) - bitcoind.generate_block(1, wait_for_mempool=1) - wait_for(lambda: only_one(l1.rpc.listpeerchannels()['channels'])['state'] == 'CHANNELD_NORMAL') - - # Give them both something for onchain close. - l1.pay(l2, 1000000) - - # Make sure it's completely quiescent. - l1.daemon.wait_for_log("chan#3: Removing out HTLC 0 state RCVD_REMOVE_ACK_REVOCATION FULFILLED") - - l1.rpc.disconnect(l2.info['id'], force=True) - l1.rpc.connect(l2.info['id'], 'localhost', l2.port) - l1.daemon.wait_for_log('option_static_remotekey enabled at 3/3') - - # But this is the *pre*-update commit tx! - l2.stop() - l1.rpc.close(l2.info['id'], unilateraltimeout=1) - bitcoind.generate_block(1, wait_for_mempool=1) - l2.start() - - # They should both handle it fine. - _, txid, blocks = l1.wait_for_onchaind_tx('OUR_DELAYED_RETURN_TO_WALLET', - 'OUR_UNILATERAL/DELAYED_OUTPUT_TO_US') - assert blocks == 4 - l2.daemon.wait_for_logs(['Ignoring output .*: THEIR_UNILATERAL/OUTPUT_TO_US', - 'Ignoring output .*: THEIR_UNILATERAL/DELAYED_OUTPUT_TO_THEM']) - bitcoind.generate_block(4) - bitcoind.generate_block(100, wait_for_mempool=txid) - - # This works even if they disconnect and listpeerchannels() is empty: - wait_for(lambda: len(l1.rpc.listpeerchannels()['channels']) == 0) - wait_for(lambda: len(l2.rpc.listpeerchannels()['channels']) == 0) - - # TEST 4: Unilateral close from post-upgrade - l1.rpc.connect(l2.info['id'], 'localhost', port=l2.port) - l1.rpc.fundchannel(l2.info['id'], 'all', channel_type=[]) - - l1.rpc.disconnect(l2.info['id'], force=True) - l1.rpc.connect(l2.info['id'], 'localhost', l2.port) - l1.daemon.wait_for_log('option_static_remotekey enabled at 1/1') - - bitcoind.generate_block(1, wait_for_mempool=1) - wait_for(lambda: only_one(l1.rpc.listpeerchannels()['channels'])['state'] == 'CHANNELD_NORMAL') - - # Move to static_remotekey. - l1.pay(l2, 1000000) - - l2.stop() - l1.rpc.close(l2.info['id'], unilateraltimeout=1) - bitcoind.generate_block(1, wait_for_mempool=1) - l2.start() - - # They should both handle it fine. 
- _, txid, blocks = l1.wait_for_onchaind_tx('OUR_DELAYED_RETURN_TO_WALLET', - 'OUR_UNILATERAL/DELAYED_OUTPUT_TO_US') - assert blocks == 4 - l2.daemon.wait_for_logs(['Ignoring output .*: THEIR_UNILATERAL/OUTPUT_TO_US', - 'Ignoring output .*: THEIR_UNILATERAL/DELAYED_OUTPUT_TO_THEM']) - - bitcoind.generate_block(4) - bitcoind.generate_block(100, wait_for_mempool=txid) - - # This works even if they disconnect and listpeerchannels() is empty: - wait_for(lambda: len(l2.rpc.listpeerchannels()['channels']) == 0) - - -@unittest.skip("experimental-upgrade-protocol TLV fields conflict with splicing TLV fields") -def test_upgrade_statickey_fail(node_factory, executor, bitcoind): - """We reconnect at all points during retransmit, and we won't upgrade.""" - l1_disconnects = ['-WIRE_COMMITMENT_SIGNED', - '-WIRE_REVOKE_AND_ACK'] - l2_disconnects = ['-WIRE_REVOKE_AND_ACK', - '-WIRE_COMMITMENT_SIGNED'] - - l1, l2 = node_factory.get_nodes(2, opts=[{'may_reconnect': True, - 'dev-no-reconnect': None, - 'disconnect': l1_disconnects, - # This allows us to send non-static-remotekey! - 'dev-any-channel-type': None, - 'experimental-upgrade-protocol': None, - # Don't have feerate changes! - 'feerates': (7500, 7500, 7500, 7500)}, - {'may_reconnect': True, - 'dev-no-reconnect': None, - 'experimental-upgrade-protocol': None, - # This forces us to accept non-static-remotekey! - 'dev-any-channel-type': None, - 'disconnect': l2_disconnects, - 'plugin': os.path.join(os.getcwd(), 'tests/plugins/hold_htlcs.py'), - 'hold-time': 10000, - 'hold-result': 'fail'}]) - l1.fundwallet(FUNDAMOUNT + 1000) - l1.rpc.connect(l2.info['id'], 'localhost', port=l2.port) - l1.rpc.fundchannel(l2.info['id'], 'all', channel_type=[]) - bitcoind.generate_block(1, wait_for_mempool=1) - wait_for(lambda: only_one(l1.rpc.listpeerchannels()['channels'])['state'] == 'CHANNELD_NORMAL') - - # This HTLC will fail - l1.rpc.sendpay([{'amount_msat': 1000, 'id': l2.info['id'], 'delay': 5, 'channel': first_scid(l1, l2)}], '00' * 32, payment_secret='00' * 32) - - # Each one should cause one disconnection, no upgrade. - for d in l1_disconnects + l2_disconnects: - l1.daemon.wait_for_log('Peer connection lost') - l2.daemon.wait_for_log('Peer connection lost') - assert not l1.daemon.is_in_log('option_static_remotekey enabled') - assert not l2.daemon.is_in_log('option_static_remotekey enabled') - l1.rpc.connect(l2.info['id'], 'localhost', l2.port) - line1 = l1.daemon.wait_for_log('No upgrade') - line2 = l2.daemon.wait_for_log('No upgrade') - - # On the last reconnect, it retransmitted revoke_and_ack. - assert re.search('No upgrade: we retransmitted', line1) - assert re.search('No upgrade: pending changes', line2) - - # Make sure we already skip the first of these. - l1.daemon.wait_for_log('billboard perm: Reconnected, and reestablished.') - assert 'option_static_remotekey' not in only_one(l1.rpc.listpeerchannels()['channels'])['features'] - assert 'option_static_remotekey' not in only_one(l2.rpc.listpeerchannels()['channels'])['features'] - - sleeptime = 1 - while True: - # Now when we reconnect, despite having an HTLC, we're quiescent. - l1.rpc.disconnect(l2.info['id'], force=True) - l1.rpc.connect(l2.info['id'], 'localhost', l2.port) - - oldstart = l1.daemon.logsearch_start - l1.daemon.wait_for_log('billboard perm: Reconnected, and reestablished.') - if not l1.daemon.is_in_log('No upgrade:', start=oldstart): - break - - # Give it some processing time before reconnect... 
-            time.sleep(sleeptime)
-            sleeptime += 1
-
-    l1.daemon.logsearch_start = oldstart
-    assert l1.daemon.wait_for_log('option_static_remotekey enabled at 2/2')
-    assert l2.daemon.wait_for_log('option_static_remotekey enabled at 2/2')
-    assert 'option_static_remotekey' in only_one(l1.rpc.listpeerchannels()['channels'])['features']
-    assert 'option_static_remotekey' in only_one(l2.rpc.listpeerchannels()['channels'])['features']
-
-
 def test_quiescence(node_factory, executor):
     l1, l2 = node_factory.line_graph(2)

From 4faba50bdf7b11a449b80a3f026bd728aab3d9dc Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Thu, 11 Dec 2025 17:54:14 +1030
Subject: [PATCH 16/22] pytest: move benchmark in test_connection.py to tests/benchmark.py

Signed-off-by: Rusty Russell
---
 tests/benchmark.py       | 36 +++++++++++++++++++++++++++++++++++-
 tests/test_connection.py | 35 -----------------------------------
 2 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/tests/benchmark.py b/tests/benchmark.py
index ec0062e82f7a..97d26f6071c1 100644
--- a/tests/benchmark.py
+++ b/tests/benchmark.py
@@ -1,7 +1,8 @@
 from concurrent import futures
 from fixtures import *  # noqa: F401,F403
+from pyln.client import RpcError
 from tqdm import tqdm
-from utils import (wait_for, TIMEOUT)
+from utils import (wait_for, TIMEOUT, only_one)
 
 import os
@@ -228,3 +229,36 @@ def test_spam_listcommands(node_factory, bitcoind, benchmark):
 
     # This calls "listinvoice" 100,000 times (which doesn't need a transaction commit)
     benchmark(l1.rpc.spamlistcommand, 100_000)
+
+
+def test_payment_speed(node_factory, benchmark):
+    """This makes sure we don't screw up nagle handling.
+
+    Normally:
+    Name (time in ms)      Min      Max     Mean   StdDev   Median     IQR  Outliers      OPS  Rounds  Iterations
+    test_payment_speed 16.3587  40.4925  27.4874   5.5512  27.7885  8.9291       9;0  36.3803      33           1
+
+    Without TCP_NODELAY:
+    Name (time in ms)       Min       Max      Mean  StdDev    Median     IQR  Outliers     OPS  Rounds  Iterations
+    test_payment_speed 153.7132  163.2027  158.6747  3.4059  158.5219  6.3745       3;0  6.3022       9           1
+    """
+    l1 = get_bench_node(node_factory, extra_options={'commit-time': 0})
+    l2 = get_bench_node(node_factory, extra_options={'commit-time': 0})
+
+    node_factory.join_nodes([l1, l2])
+
+    scid = only_one(l1.rpc.listpeerchannels()['channels'])['short_channel_id']
+    routestep = {
+        'amount_msat': 100,
+        'id': l2.info['id'],
+        'delay': 5,
+        'channel': scid
+    }
+
+    def onepay(l1, routestep):
+        phash = random.randbytes(32).hex()
+        l1.rpc.sendpay([routestep], phash)
+        with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"):
+            l1.rpc.waitsendpay(phash)
+
+    benchmark(onepay, l1, routestep)

diff --git a/tests/test_connection.py b/tests/test_connection.py
index 92a2a12f9487..37530022b966 100644
--- a/tests/test_connection.py
+++ b/tests/test_connection.py
@@ -4579,41 +4579,6 @@ def test_no_delay(node_factory):
     assert end < start + 100 * 0.5
 
 
-@unittest.skipIf(os.getenv('TEST_BENCH', '0') == '0', "For profiling")
-def test_bench(node_factory):
-    """Is our Nagle disabling for critical messages working?"""
-    l1, l2 = node_factory.get_nodes(2, opts={'start': False,
-                                             'commit-time': 0})
-
-    # memleak detection plays havoc with profiles.
- del l1.daemon.env["LIGHTNINGD_DEV_MEMLEAK"] - del l2.daemon.env["LIGHTNINGD_DEV_MEMLEAK"] - - l1.start() - l2.start() - node_factory.join_nodes([l1, l2]) - - scid = only_one(l1.rpc.listpeerchannels()['channels'])['short_channel_id'] - routestep = { - 'amount_msat': 100, - 'id': l2.info['id'], - 'delay': 5, - 'channel': scid - } - - start = time.time() - # If we were stupid enough to leave Nagle enabled, this would add 200ms - # seconds delays each way! - for _ in range(1000): - phash = random.randbytes(32).hex() - l1.rpc.sendpay([routestep], phash) - with pytest.raises(RpcError, match="WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS"): - l1.rpc.waitsendpay(phash) - end = time.time() - duration = end - start - assert duration == 0 - - def test_listpeerchannels_by_scid(node_factory): l1, l2, l3 = node_factory.line_graph(3, announce_channels=False) From 97c81b2bb9dee359cb15e737846ad7a58f604b54 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:15 +1030 Subject: [PATCH 17/22] pytest: give test_xpay_maxfee longer, as it can time out under CI. ``` > ret = l1.rpc.xpay(invstring=inv, maxfee=maxfee) tests/test_xpay.py:585: ... > raise RpcError(method, payload, resp['error']) E pyln.client.lightning.RpcError: RPC call failed: method: xpay, payload: {'invstring': 'lnbcrt1m1p5n5wdzsp5qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqpp52mu6842a26hs40xxgzscflm4smk5yjctqgf7hhrwhx7dh2492vzsdp22pshj6twvusxummyv5sr2wfqwa5hg6pqd4shsen9v5cqpj9qyyqqqj9kvjvrzy0ygdsfsskjtss0xrkrt7lq8jyrgzvtvdw5y9xqy0v25ddxd787c9ym36udm876lyhevznj8j9qzk0r7x03xm0akvq6ltwcq7vm7tk', 'maxfee': 57966}, error: {'code': 209, 'message': "Failed after 4 attempts. We got temporary_channel_failure for 59x81x28707/1, assuming it can't carry 49498813msat. Then routing for remaining 49498813msat failed: linear_routes: timed out after deadline"} ... lightningd-1 2025-12-11T03:25:41.972Z DEBUG plugin-cln-askrene: notify msg debug: get_routes failed after 15572 ms ``` Signed-off-by: Rusty Russell --- tests/test_xpay.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_xpay.py b/tests/test_xpay.py index 949584121fcd..20ba34933ff4 100644 --- a/tests/test_xpay.py +++ b/tests/test_xpay.py @@ -557,7 +557,9 @@ def test_xpay_maxfee(node_factory, bitcoind, chainparams): opts=[{'gossip_store_file': outfile.name, 'subdaemon': 'channeld:../tests/plugins/channeld_fakenet', 'allow_warning': True, - 'dev-throttle-gossip': None}, + 'dev-throttle-gossip': None, + # This can be more than 10 seconds under CI! + 'askrene-timeout': 60}, {'allow_bad_gossip': True}]) # l1 needs to know l2's shaseed for the channel so it can make revocations From 3ca4e32993a0704b91997c2df963020b58be8c76 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 11 Dec 2025 17:54:15 +1030 Subject: [PATCH 18/22] pytest: fix timing flake in test_invoice_expiry. Under Postgres, this actually takes more than 2 seconds, so w2 really has timed out already: ``` time.sleep(2) # total 2 assert not w1.done() > assert not w2.done() E assert not True E + where True = done() E + where done = .done tests/test_invoices.py:420: AssertionError ``` So space the timeouts out more, and sleep one second too short; the .result() (which sleeps) will catch up if we were extremely slow. 
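For reference, the new schedule (expiries of 16, 10 and 22 seconds):

```
t=2   sleep(2): all three waiters still pending
t=9   sleep(7): w1 pending; w2.result() blocks until inv2 expires at t=10
t=15  sleep(5): w3 pending; w1.result() blocks until inv1 expires at t=16
t=22  w3.result() raises once inv3 expires
```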
Signed-off-by: Rusty Russell
---
 tests/test_invoices.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/test_invoices.py b/tests/test_invoices.py
index 6f4bc3ca612f..a3db715adf02 100644
--- a/tests/test_invoices.py
+++ b/tests/test_invoices.py
@@ -407,9 +407,10 @@ def test_invoice_expiry(node_factory, executor):
 
     # Test expiration waiting.
     # The second invoice created expires first.
-    l2.rpc.invoice('any', 'inv1', 'description', 10)
-    l2.rpc.invoice('any', 'inv2', 'description', 4)
-    l2.rpc.invoice('any', 'inv3', 'description', 16)
+    # Times should be long enough even for our terrible CI runners!
+    l2.rpc.invoice('any', 'inv1', 'description', 16)
+    l2.rpc.invoice('any', 'inv2', 'description', 10)
+    l2.rpc.invoice('any', 'inv3', 'description', 22)
 
     # Check waitinvoice correctly waits
     w1 = executor.submit(l2.rpc.waitinvoice, 'inv1')
@@ -419,19 +420,18 @@
     assert not w1.done()
     assert not w2.done()
     assert not w3.done()
-    time.sleep(4)  # total 6
+    time.sleep(7)  # total 9
 
     assert not w1.done()
-    with pytest.raises(RpcError):
+    with pytest.raises(RpcError):  # total 10
         w2.result()
     assert not w3.done()
-    time.sleep(6)  # total 12
-    with pytest.raises(RpcError):
+    time.sleep(5)  # total 15
+    with pytest.raises(RpcError):  # total 16
         w1.result()
     assert not w3.done()
-    time.sleep(8)  # total 20
 
     with pytest.raises(RpcError):
         w3.result()

From fc641bb66abea6b29c66dceea50dc8b94dd573c8 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Thu, 11 Dec 2025 17:54:15 +1030
Subject: [PATCH 19/22] ci: don't run shard 2/12 ubsan without parallel.

3974806e5af added this:

    CI: Try not running group 2/10 UBSAN in parallel.

    It's being killed with signal 143, which means docker isn't happy;
    too much memory consumption?

But since we're now at 12 groups, that probably doesn't apply (it may not
have applied even then; so many things have been added in the two years
since that commit).  And it caused this shard to take over 2 hours and
time out.

Signed-off-by: Rusty Russell
---
 .github/workflows/ci.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 51a02bf2e202..b6f65611dba4 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -512,7 +512,7 @@ jobs:
           - NAME: ASan/UBSan (01/12)
             PYTEST_OPTS: --test-group=1 --test-group-count=12
           - NAME: ASan/UBSan (02/12)
-            PYTEST_OPTS: --test-group=2 --test-group-count=12 -n 1
+            PYTEST_OPTS: --test-group=2 --test-group-count=12
           - NAME: ASan/UBSan (03/12)
             PYTEST_OPTS: --test-group=3 --test-group-count=12
           - NAME: ASan/UBSan (04/12)

From fa42981c219017c28fe04cbd1fb043d7659cf583 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Fri, 12 Dec 2025 10:02:52 +1030
Subject: [PATCH 20/22] pytest: disable remaining flaky and skip markers to see what else fails.
Signed-off-by: Rusty Russell
---
 tests/test_closing.py   | 2 +-
 tests/test_coinmoves.py | 1 -
 tests/test_invoices.py  | 1 -
 tests/test_plugin.py    | 1 -
 tests/test_reckless.py  | 1 -
 5 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/tests/test_closing.py b/tests/test_closing.py
index 33abde9c8b6e..47cc28d8077c 100644
--- a/tests/test_closing.py
+++ b/tests/test_closing.py
@@ -3437,7 +3437,7 @@ def test_closing_higherfee(node_factory, bitcoind, executor, anchors):
     wait_for(lambda: l2.rpc.listpeerchannels()['channels'][0]['state'] == 'CLOSINGD_COMPLETE')
 
 
-@unittest.skipIf(True, "Test is extremely flaky")
+@pytest.mark.flaky(reruns=3)
 def test_htlc_rexmit_while_closing(node_factory, executor):
     """Retranmitting an HTLC revocation while shutting down should work"""
     # FIXME: This should be in lnprototest! UNRELIABLE.
diff --git a/tests/test_coinmoves.py b/tests/test_coinmoves.py
index dc5abb044ce5..291f857ba6b3 100644
--- a/tests/test_coinmoves.py
+++ b/tests/test_coinmoves.py
@@ -681,7 +681,6 @@ def test_coinmoves_unilateral_htlc_before_included(node_factory, bitcoind):
     check_balances(l1, l2, fundchannel['channel_id'], 0)
 
 
-@pytest.mark.flaky(reruns=5)
 @pytest.mark.openchannel('v1')
 @pytest.mark.openchannel('v2')
 @unittest.skipIf(TEST_NETWORK != 'regtest', "Amounts are for regtest.")
diff --git a/tests/test_invoices.py b/tests/test_invoices.py
index a3db715adf02..37e6695be7ab 100644
--- a/tests/test_invoices.py
+++ b/tests/test_invoices.py
@@ -927,7 +927,6 @@ def test_invoices_wait_db_migration(node_factory, bitcoind):
 
 @unittest.skipIf(os.getenv('TEST_DB_PROVIDER', 'sqlite3') != 'sqlite3', "This test is based on a sqlite3 snapshot")
 @unittest.skipIf(TEST_NETWORK != 'regtest', "The DB migration is network specific due to the chain var.")
-@pytest.mark.flaky(reruns=5)
 def test_invoice_botched_migration(node_factory, chainparams):
     """Test for grubles' case, where they ran successfully with the wrong var: they have *both* last_invoice_created_index *and *last_invoices_created_index* (this can happen if invoice id 1 was deleted, so they didn't die on invoice creation):
 
 Error executing statement: wallet/db.c:1684: UPDATE vars SET name = 'last_invoices_created_index' WHERE name = 'last_invoice_created_index': UNIQUE constraint failed: vars.name
diff --git a/tests/test_plugin.py b/tests/test_plugin.py
index 32fb94506d2a..328ec7d06081 100644
--- a/tests/test_plugin.py
+++ b/tests/test_plugin.py
@@ -4309,7 +4309,6 @@ def test_plugin_nostart(node_factory):
     assert [p['name'] for p in l1.rpc.plugin_list()['plugins'] if 'badinterp' in p['name']] == []
 
 
-@unittest.skip("A bit flaky, but when breaks, it is costing us 2h of CI time")
 def test_plugin_startdir_lol(node_factory):
     """Though we fail to start many of them, we don't crash!"""
     l1 = node_factory.get_node(broken_log='.*')
diff --git a/tests/test_reckless.py b/tests/test_reckless.py
index 275960d576fb..311ed1ae4c1a 100644
--- a/tests/test_reckless.py
+++ b/tests/test_reckless.py
@@ -351,7 +351,6 @@ def test_tag_install(node_factory):
         header = line
 
 
-@pytest.mark.flaky(reruns=5)
 @pytest.mark.slow_test
 def test_reckless_uv_install(node_factory):
     node = get_reckless_node(node_factory)

From d0e785a20bdd938e6b0b77826b5e5af8f8000a43 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Fri, 12 Dec 2025 10:03:07 +1030
Subject: [PATCH 21/22] pytest: don't run test_hook_in_use under VALGRIND on CI.

It's not reliable:

```
# We should have deferred hook update at least once!
> l2.daemon.wait_for_log("UNUSUAL plugin-dep_b.py: Deferring registration of hook htlc_accepted until it's not in use.")
tests/test_plugin.py:2646:
...
if self.is_in_log(r):
    print("({} was previously in logs!)".format(r))
> raise TimeoutError('Unable to find "{}" in logs.'.format(exs))
E TimeoutError: Unable to find "[re.compile("UNUSUAL plugin-dep_b.py: Deferring registration of hook htlc_accepted until it's not in use.")]" in logs.
```

Signed-off-by: Rusty Russell
---
 tests/test_plugin.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_plugin.py b/tests/test_plugin.py
index 328ec7d06081..67d2d0f22121 100644
--- a/tests/test_plugin.py
+++ b/tests/test_plugin.py
@@ -2614,6 +2614,7 @@ def test_htlc_accepted_hook_failonion(node_factory):
     l1.rpc.pay(inv)
 
 
+@pytest.mark.slow_test  # VALGRIND is generally too slow to trigger the race we need.
 def test_hook_in_use(node_factory):
     """If a hook is in use when we add a plugin to it, we have to defer"""
     dep_a = os.path.join(os.path.dirname(__file__), 'plugins/dep_a.py')

From d21a24cf2379b5a7de699ce992781824b55f1684 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Fri, 12 Dec 2025 10:03:07 +1030
Subject: [PATCH 22/22] lightningd: fix occasional memleak when we detach subd from channel.

Do this by calling notleak() on the subd when we do the detach!

```
**BROKEN** lightningd: MEMLEAK: 0x60f0000bbb38
**BROKEN** lightningd: label=ccan/ccan/io/io.c:92:struct io_conn
**BROKEN** lightningd: alloc:
**BROKEN** lightningd: /home/runner/work/lightning/lightning/ccan/ccan/tal/tal.c:488 (tal_alloc_)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/ccan/ccan/io/io.c:92 (io_new_conn_)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/subd.c:781 (new_subd)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/subd.c:835 (new_channel_subd_)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/channel_control.c:1715 (peer_start_channeld)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/peer_control.c:1390 (connect_activate_subd)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/peer_control.c:1516 (peer_connected_hook_final)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/plugin_hook.c:243 (hook_done)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/plugin_hook.c:343 (plugin_hook_call_next)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/plugin_hook.c:299 (plugin_hook_callback)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/plugin.c:701 (plugin_response_handle)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/plugin.c:790 (plugin_read_json)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/ccan/ccan/io/io.c:60 (next_plan)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/ccan/ccan/io/io.c:422 (do_plan)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/ccan/ccan/io/io.c:439 (io_ready)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/ccan/ccan/io/poll.c:470 (io_loop)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/io_loop_with_timers.c:22 (io_loop_with_timers)
**BROKEN** lightningd: /home/runner/work/lightning/lightning/lightningd/lightningd.c:1492 (main)
**BROKEN** lightningd: ../sysdeps/nptl/libc_start_call_main.h:58 (__libc_start_call_main)
**BROKEN** lightningd: ../csu/libc-start.c:392 (__libc_start_main_impl)
**BROKEN** lightningd: parents:
**BROKEN** lightningd: lightningd/lightningd.c:108:struct lightningd
```

Signed-off-by: Rusty Russell
---
 lightningd/subd.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lightningd/subd.c b/lightningd/subd.c
index 744d615296c5..05e08c08205a 100644
--- a/lightningd/subd.c
+++ b/lightningd/subd.c
@@ -438,6 +438,8 @@ static bool handle_peer_error(struct subd *sd, const u8 *msg, int fds[1])
 	/* Don't free sd; we may be about to free channel. */
 	sd->channel = NULL;
+	/* While it's cleaning up, this is not a leak! */
+	notleak(sd);
 	sd->errcb(channel, peer_fd, desc, err_for_them, disconnect, warning);
 	return true;
 }
@@ -641,6 +643,8 @@ static void destroy_subd(struct subd *sd)
 		/* Clear any transient messages in billboard */
 		sd->billboardcb(channel, false, NULL);
+	/* While it's cleaning up, this is not a leak! */
+	notleak(sd);
 	sd->channel = NULL;
 	/* We can be freed both inside msg handling, or spontaneously. */
@@ -928,11 +932,6 @@ void subd_release_channel(struct subd *owner, const void *channel)
 		assert(owner->channel == channel);
 		owner->channel = NULL;
 		tal_free(owner);
-	} else {
-		/* Caller has reassigned channel->owner, so there's no pointer
-		 * to this subd owner while it's freeing itself. If we
-		 * ask memleak right now, it will complain! */
-		notleak(owner);
-	}
 }
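A footnote on the mechanics: this is the usual memleak-API pattern in this
codebase.  The moment nothing reachable points at the subd, mark it with
notleak() so the dev_memleak scan tolerates it while it finishes tearing
itself down.  A minimal sketch of the pattern (the helper is illustrative,
not code from the tree):

```c
#include <common/memleak.h>
#include <lightningd/subd.h>

/* Illustrative only: detach a subd from its channel without tripping
 * the dev_memleak scan while the subd is still freeing itself. */
static void detach_subd_example(struct subd *sd)
{
	/* Once this is cleared, nothing reachable points at sd, so the
	 * leak scan would report it as allocated-but-unreferenced. */
	sd->channel = NULL;

	/* Tell the scanner this allocation is expected to dangle briefly. */
	notleak(sd);
}
```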