
Commit 0f23796

Fix races when removing per-peer state from KVStore in LSPS2/5
If we note that a peer should be removed in LSPS2/5 handling, we need to make sure the peer wasn't re-added between dropping its state in memory and going to remove its state from disk. If it was, we need to overwrite the current on-disk state instead.
1 parent c202d35 · commit 0f23796
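
The race being fixed is easiest to see as a two-pass prune: a first pass collects removal candidates under the read lock, and a second pass re-checks each candidate under the write lock before touching the store, because the peer may have been re-added (or regained state) in between. The sketch below shows that shape using only `std` types; `Service`, `PeerState`, and the in-memory `store` map are hypothetical stand-ins for illustration, not LDK's LSPS2/5 types or its `KVStore` API, and the store call is synchronous here whereas the real code starts an async removal under the lock.

use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::{Mutex, RwLock};

// Hypothetical stand-ins, not LDK types: a peer is prunable once it holds no state.
struct PeerState {
	needs_persist: bool,
	webhooks: Vec<String>,
}

impl PeerState {
	fn is_prunable(&self) -> bool {
		self.webhooks.is_empty()
	}
}

struct Service {
	per_peer_state: RwLock<HashMap<u64, PeerState>>,
	// Stands in for the persisted copy of each peer's state in the KVStore.
	store: Mutex<HashMap<u64, Vec<u8>>>,
}

impl Service {
	fn prune(&self) {
		// Pass 1: collect removal candidates while holding only the read lock.
		let need_remove: Vec<u64> = self
			.per_peer_state
			.read()
			.unwrap()
			.iter()
			.filter(|(_, state)| state.is_prunable())
			.map(|(id, _)| *id)
			.collect();

		// Pass 2: re-check each candidate under the write lock, since the peer
		// may have been re-added between the two passes.
		for id in need_remove {
			let mut per_peer_state = self.per_peer_state.write().unwrap();
			match per_peer_state.entry(id) {
				Entry::Occupied(mut entry) => {
					if entry.get().is_prunable() {
						// Still empty: drop it from memory and from the store.
						entry.remove();
						self.store.lock().unwrap().remove(&id);
					} else {
						// The peer regained state in the meantime: keep it and
						// make sure its current state is written out again.
						entry.get_mut().needs_persist = true;
					}
				},
				Entry::Vacant(_) => debug_assert!(false, "only the pruner removes entries"),
			}
		}
	}
}

fn main() {
	let svc = Service {
		per_peer_state: RwLock::new(HashMap::from([(
			1,
			PeerState { needs_persist: false, webhooks: vec![] },
		)])),
		store: Mutex::new(HashMap::from([(1, vec![0u8])])),
	};
	svc.prune();
	assert!(svc.store.lock().unwrap().is_empty());
}

Marking `needs_persist` instead of removing is what turns a stale removal candidate into a plain re-persist of the peer's current state, which is exactly the fallback the diffs below take.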

2 files changed: +97, -42 lines


lightning-liquidity/src/lsps2/service.rs

Lines changed: 50 additions & 23 deletions
@@ -1603,7 +1603,7 @@ where
 	) -> Result<(), lightning::io::Error> {
 		let fut = {
 			let outer_state_lock = self.per_peer_state.read().unwrap();
-			let encoded = match outer_state_lock.get(&counterparty_node_id) {
+			match outer_state_lock.get(&counterparty_node_id) {
 				None => {
 					// We dropped the peer state by now.
 					return Ok(());
@@ -1615,18 +1615,19 @@ where
 						return Ok(());
 					} else {
 						peer_state_lock.needs_persist = false;
-						peer_state_lock.encode()
+						let key = counterparty_node_id.to_string();
+						let encoded = peer_state_lock.encode();
+						// Begin the write with the entry lock held. This avoids racing with
+						// potentially-in-flight `persist` calls writing state for the same peer.
+						self.kv_store.write(
+							LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
+							LSPS2_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
+							&key,
+							encoded,
+						)
 					}
 				},
-			};
-			let key = counterparty_node_id.to_string();
-
-			self.kv_store.write(
-				LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
-				LSPS2_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
-				&key,
-				encoded,
-			)
+			}
 		};

 		fut.await.map_err(|e| {
@@ -1648,8 +1649,10 @@ where
 		let mut need_persist = Vec::new();

 		{
-			let mut outer_state_lock = self.per_peer_state.write().unwrap();
-			outer_state_lock.retain(|counterparty_node_id, inner_state_lock| {
+			// First build a list of peers to persist and prune with the read lock. This allows
+			// us to avoid the write lock unless we actually need to remove a node.
+			let outer_state_lock = self.per_peer_state.read().unwrap();
+			for (counterparty_node_id, inner_state_lock) in outer_state_lock.iter() {
 				let mut peer_state_lock = inner_state_lock.lock().unwrap();
 				peer_state_lock.prune_expired_request_state();
 				let is_prunable = peer_state_lock.is_prunable();
@@ -1658,8 +1661,7 @@ where
 				} else if peer_state_lock.needs_persist {
 					need_persist.push(*counterparty_node_id);
 				}
-				!is_prunable
-			});
+			}
 		}

 		for counterparty_node_id in need_persist.into_iter() {
@@ -1668,14 +1670,39 @@ where
 		}

 		for counterparty_node_id in need_remove {
-			let key = counterparty_node_id.to_string();
-			self.kv_store
-				.remove(
-					LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
-					LSPS2_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
-					&key,
-				)
-				.await?;
+			let mut future_opt = None;
+			{
+				// We need to take the `per_peer_state` write lock to remove an entry, but also
+				// have to hold it until after the `remove` call returns (but not through
+				// future completion) to ensure that writes for the peer's state are
+				// well-ordered with other `persist_peer_state` calls even across the removal
+				// itself.
+				let mut per_peer_state = self.per_peer_state.write().unwrap();
+				if let Entry::Occupied(mut entry) = per_peer_state.entry(counterparty_node_id) {
+					let state = entry.get_mut().get_mut().unwrap();
+					if state.is_prunable() {
+						entry.remove();
+						let key = counterparty_node_id.to_string();
+						future_opt = Some(self.kv_store.remove(
+							LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
+							LSPS2_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
+							&key,
+						));
+					} else {
+						// If the peer got new state, force a re-persist of the current state.
+						state.needs_persist = true;
+					}
+				} else {
+					// This should never happen, we can only have one `persist` call
+					// in-progress at once and map entries are only removed by it.
+					debug_assert!(false);
+				}
+			}
+			if let Some(future) = future_opt {
+				future.await?;
+			} else {
+				self.persist_peer_state(counterparty_node_id).await?;
+			}
 		}

 		Ok(())
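
The subtlest line in the new comments is that the write lock must be held "until after the `remove` call returns (but not through future completion)". That only helps if the store decides the ordering of operations at call time, while the returned future merely waits for the backend to finish. The sketch below illustrates that assumption with a hypothetical `FakeStore` and `prune_one` helper rather than LDK's actual `KVStore` trait; it is executor-agnostic and only meant to show why the future is created under the guard but awaited after the guard is dropped.

use std::collections::HashMap;
use std::future::Future;
use std::sync::{Mutex, RwLock};

// Hypothetical stand-in for an async store backend: the assumption here is that
// calling `remove` enqueues the operation synchronously (fixing its order
// relative to other writes), while the returned future only waits for completion.
#[derive(Default)]
struct FakeStore {
	ops: Mutex<Vec<String>>,
}

impl FakeStore {
	fn remove(&self, key: String) -> impl Future<Output = ()> + '_ {
		// The operation's position in the queue is decided here, while the
		// caller still holds its own lock.
		self.ops.lock().unwrap().push(format!("remove {key}"));
		async move { /* the backend would flush the queued op here */ }
	}
}

async fn prune_one(
	per_peer_state: &RwLock<HashMap<String, Vec<u8>>>, store: &FakeStore, key: String,
) {
	// Start the removal while the write lock is held so that no concurrent
	// persist call for the same peer can be ordered between the in-memory
	// removal and the store removal...
	let fut = {
		let mut peers = per_peer_state.write().unwrap();
		peers.remove(&key);
		store.remove(key)
	};
	// ...but await only after the guard has been dropped: a std lock guard
	// should never be held across an `.await`.
	fut.await;
}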

lightning-liquidity/src/lsps5/service.rs

Lines changed: 47 additions & 19 deletions
@@ -20,6 +20,7 @@ use crate::message_queue::MessageQueue;
 use crate::persist::{
 	LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE, LSPS5_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
 };
+use crate::prelude::hash_map::Entry;
 use crate::prelude::*;
 use crate::sync::{Arc, Mutex, RwLock, RwLockWriteGuard};
 use crate::utils::time::TimeProvider;
@@ -220,6 +221,8 @@ where

 		let key = counterparty_node_id.to_string();

+		// Begin the write with the `per_peer_state` write lock held to avoid racing with
+		// potentially-in-flight `persist` calls writing state for the same peer.
 		self.kv_store.write(
 			LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
 			LSPS5_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
@@ -244,36 +247,61 @@ where
 		// time.
 		let mut need_remove = Vec::new();
 		let mut need_persist = Vec::new();
+
+		self.check_prune_stale_webhooks(&mut self.per_peer_state.write().unwrap());
 		{
-			let mut outer_state_lock = self.per_peer_state.write().unwrap();
-			self.check_prune_stale_webhooks(&mut outer_state_lock);
+			let outer_state_lock = self.per_peer_state.read().unwrap();

-			outer_state_lock.retain(|client_id, peer_state| {
+			for (client_id, peer_state) in outer_state_lock.iter() {
 				let is_prunable = peer_state.is_prunable();
 				let has_open_channel = self.client_has_open_channel(client_id);
 				if is_prunable && !has_open_channel {
 					need_remove.push(*client_id);
 				} else if peer_state.needs_persist {
 					need_persist.push(*client_id);
 				}
-				!is_prunable || has_open_channel
-			});
-		};
+			}
+		}

-		for counterparty_node_id in need_persist.into_iter() {
-			debug_assert!(!need_remove.contains(&counterparty_node_id));
-			self.persist_peer_state(counterparty_node_id).await?;
+		for client_id in need_persist.into_iter() {
+			debug_assert!(!need_remove.contains(&client_id));
+			self.persist_peer_state(client_id).await?;
 		}

-		for counterparty_node_id in need_remove {
-			let key = counterparty_node_id.to_string();
-			self.kv_store
-				.remove(
-					LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
-					LSPS5_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
-					&key,
-				)
-				.await?;
+		for client_id in need_remove {
+			let mut future_opt = None;
+			{
+				// We need to take the `per_peer_state` write lock to remove an entry, but also
+				// have to hold it until after the `remove` call returns (but not through
+				// future completion) to ensure that writes for the peer's state are
+				// well-ordered with other `persist_peer_state` calls even across the removal
+				// itself.
+				let mut per_peer_state = self.per_peer_state.write().unwrap();
+				if let Entry::Occupied(mut entry) = per_peer_state.entry(client_id) {
+					let state = entry.get_mut();
+					if state.is_prunable() && !self.client_has_open_channel(&client_id) {
+						entry.remove();
+						let key = client_id.to_string();
+						future_opt = Some(self.kv_store.remove(
+							LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
+							LSPS5_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
+							&key,
+						));
+					} else {
+						// If the peer was re-added, force a re-persist of the current state.
+						state.needs_persist = true;
+					}
+				} else {
+					// This should never happen, we can only have one `persist` call
+					// in-progress at once and map entries are only removed by it.
+					debug_assert!(false);
+				}
+			}
+			if let Some(future) = future_opt {
+				future.await?;
+			} else {
+				self.persist_peer_state(client_id).await?;
+			}
 		}

 		Ok(())
@@ -761,7 +789,7 @@ impl PeerState {
 		});
 	}

-	fn is_prunable(&mut self) -> bool {
+	fn is_prunable(&self) -> bool {
 		self.webhooks.is_empty()
 	}
 }
