
Commit a9ddf3f

Ensure mutual exclusion in LSPS2/5 persistence
There are various race conditions between `persist` calls on the LSPS2 and LSPS5 services. To avoid them, we ensure that no two `persist` calls can be in flight at the same time: if another `persist` call starts before the currently-running one finishes, the original call re-persists from the top, so any state updates made in the meantime are still persisted.
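The pattern below is a minimal, standalone sketch of that idea, assuming a hypothetical `Persister` struct with a `do_persist_pass` helper standing in for the actual per-peer persistence work (neither name is part of LDK):

use std::sync::atomic::{AtomicUsize, Ordering};

// Hypothetical stand-in for the LSPS2/LSPS5 service handlers.
struct Persister {
	persistence_in_flight: AtomicUsize,
}

impl Persister {
	async fn persist(&self) -> Result<(), std::io::Error> {
		if self.persistence_in_flight.fetch_add(1, Ordering::AcqRel) > 0 {
			// Another `persist` call is already running; it will notice our
			// increment when it finishes and re-run the pass on our behalf.
			return Ok(());
		}
		loop {
			self.do_persist_pass().await?;
			if self.persistence_in_flight.fetch_sub(1, Ordering::AcqRel) != 1 {
				// Someone raced with us while we were persisting. Reset the
				// counter to just our own entry and go around again so their
				// updates are picked up.
				self.persistence_in_flight.store(1, Ordering::Release);
				continue;
			}
			break;
		}
		Ok(())
	}

	async fn do_persist_pass(&self) -> Result<(), std::io::Error> {
		// Placeholder for the real work: walk peer state and write it out.
		Ok(())
	}
}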
1 parent 0f23796 commit a9ddf3f

File tree: 2 files changed, +132 -94 lines changed

lightning-liquidity/src/lsps2/service.rs

Lines changed: 65 additions & 47 deletions
@@ -593,6 +593,7 @@ where
 	peer_by_channel_id: RwLock<HashMap<ChannelId, PublicKey>>,
 	total_pending_requests: AtomicUsize,
 	config: LSPS2ServiceConfig,
+	persistence_in_flight: AtomicUsize,
 }

 impl<CM: Deref, K: Deref + Clone> LSPS2ServiceHandler<CM, K>
@@ -640,6 +641,7 @@ where
 			peer_by_intercept_scid: RwLock::new(peer_by_intercept_scid),
 			peer_by_channel_id: RwLock::new(peer_by_channel_id),
 			total_pending_requests: AtomicUsize::new(0),
+			persistence_in_flight: AtomicUsize::new(0),
 			channel_manager,
 			kv_store,
 			config,
@@ -1645,64 +1647,80 @@ where
 		// introduce some batching to upper-bound the number of requests inflight at any given
 		// time.

-		let mut need_remove = Vec::new();
-		let mut need_persist = Vec::new();
+		if self.persistence_in_flight.fetch_add(1, Ordering::AcqRel) > 0 {
+			// If we're not the first event processor to get here, just return early, the increment
+			// we just did will be treated as "go around again" at the end.
+			return Ok(());
+		}

-		{
-			// First build a list of peers to persist and prune with the read lock. This allows
-			// us to avoid the write lock unless we actually need to remove a node.
-			let outer_state_lock = self.per_peer_state.read().unwrap();
-			for (counterparty_node_id, inner_state_lock) in outer_state_lock.iter() {
-				let mut peer_state_lock = inner_state_lock.lock().unwrap();
-				peer_state_lock.prune_expired_request_state();
-				let is_prunable = peer_state_lock.is_prunable();
-				if is_prunable {
-					need_remove.push(*counterparty_node_id);
-				} else if peer_state_lock.needs_persist {
-					need_persist.push(*counterparty_node_id);
+		loop {
+			let mut need_remove = Vec::new();
+			let mut need_persist = Vec::new();
+
+			{
+				// First build a list of peers to persist and prune with the read lock. This allows
+				// us to avoid the write lock unless we actually need to remove a node.
+				let outer_state_lock = self.per_peer_state.read().unwrap();
+				for (counterparty_node_id, inner_state_lock) in outer_state_lock.iter() {
+					let mut peer_state_lock = inner_state_lock.lock().unwrap();
+					peer_state_lock.prune_expired_request_state();
+					let is_prunable = peer_state_lock.is_prunable();
+					if is_prunable {
+						need_remove.push(*counterparty_node_id);
+					} else if peer_state_lock.needs_persist {
+						need_persist.push(*counterparty_node_id);
+					}
 				}
 			}
-		}

-		for counterparty_node_id in need_persist.into_iter() {
-			debug_assert!(!need_remove.contains(&counterparty_node_id));
-			self.persist_peer_state(counterparty_node_id).await?;
-		}
+			for counterparty_node_id in need_persist.into_iter() {
+				debug_assert!(!need_remove.contains(&counterparty_node_id));
+				self.persist_peer_state(counterparty_node_id).await?;
+			}

-		for counterparty_node_id in need_remove {
-			let mut future_opt = None;
-			{
-				// We need to take the `per_peer_state` write lock to remove an entry, but also
-				// have to hold it until after the `remove` call returns (but not through
-				// future completion) to ensure that writes for the peer's state are
-				// well-ordered with other `persist_peer_state` calls even across the removal
-				// itself.
-				let mut per_peer_state = self.per_peer_state.write().unwrap();
-				if let Entry::Occupied(mut entry) = per_peer_state.entry(counterparty_node_id) {
-					let state = entry.get_mut().get_mut().unwrap();
-					if state.is_prunable() {
-						entry.remove();
-						let key = counterparty_node_id.to_string();
-						future_opt = Some(self.kv_store.remove(
-							LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
-							LSPS2_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
-							&key,
-						));
+			for counterparty_node_id in need_remove {
+				let mut future_opt = None;
+				{
+					// We need to take the `per_peer_state` write lock to remove an entry, but also
+					// have to hold it until after the `remove` call returns (but not through
+					// future completion) to ensure that writes for the peer's state are
+					// well-ordered with other `persist_peer_state` calls even across the removal
+					// itself.
+					let mut per_peer_state = self.per_peer_state.write().unwrap();
+					if let Entry::Occupied(mut entry) = per_peer_state.entry(counterparty_node_id) {
+						let state = entry.get_mut().get_mut().unwrap();
+						if state.is_prunable() {
+							entry.remove();
+							let key = counterparty_node_id.to_string();
+							future_opt = Some(self.kv_store.remove(
+								LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
+								LSPS2_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
+								&key,
+							));
+						} else {
+							// If the peer got new state, force a re-persist of the current state.
+							state.needs_persist = true;
+						}
 					} else {
-						// If the peer got new state, force a re-persist of the current state.
-						state.needs_persist = true;
+						// This should never happen, we can only have one `persist` call
+						// in-progress at once and map entries are only removed by it.
+						debug_assert!(false);
 					}
+				}
+				if let Some(future) = future_opt {
+					future.await?;
 				} else {
-					// This should never happen, we can only have one `persist` call
-					// in-progress at once and map entries are only removed by it.
-					debug_assert!(false);
+					self.persist_peer_state(counterparty_node_id).await?;
 				}
 			}
-			if let Some(future) = future_opt {
-				future.await?;
-			} else {
-				self.persist_peer_state(counterparty_node_id).await?;
+
+			if self.persistence_in_flight.fetch_sub(1, Ordering::AcqRel) != 1 {
+				// If another thread incremented the state while we were running we should go
+				// around again, but only once.
+				self.persistence_in_flight.store(1, Ordering::Release);
+				continue;
 			}
+			break;
 		}

 		Ok(())
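To make the counter arithmetic above concrete, here is a small walk-through of two racing `persist` calls using plain `std` atomics (illustrative only, not LDK code):

use std::sync::atomic::{AtomicUsize, Ordering};

fn main() {
	let persistence_in_flight = AtomicUsize::new(0);

	// Call A arrives first: 0 -> 1, so it runs the persistence pass.
	assert_eq!(persistence_in_flight.fetch_add(1, Ordering::AcqRel), 0);

	// Call B arrives while A is still running: 1 -> 2, so B returns early.
	assert_eq!(persistence_in_flight.fetch_add(1, Ordering::AcqRel), 1);

	// A finishes its pass: 2 -> 1. The previous value was not 1, so A knows B
	// raced with it, resets the counter to 1, and goes around again.
	assert_ne!(persistence_in_flight.fetch_sub(1, Ordering::AcqRel), 1);
	persistence_in_flight.store(1, Ordering::Release);

	// A finishes the second pass with no new racers: 1 -> 0, so it breaks out.
	assert_eq!(persistence_in_flight.fetch_sub(1, Ordering::AcqRel), 1);
}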

lightning-liquidity/src/lsps5/service.rs

Lines changed: 67 additions & 47 deletions
@@ -36,6 +36,7 @@ use lightning::util::persist::KVStore;
 use lightning::util::ser::Writeable;

 use core::ops::Deref;
+use core::sync::atomic::{AtomicUsize, Ordering};
 use core::time::Duration;

 use alloc::string::String;
@@ -140,6 +141,7 @@ where
 	node_signer: NS,
 	kv_store: K,
 	last_pruning: Mutex<Option<LSPSDateTime>>,
+	persistence_in_flight: AtomicUsize,
 }

 impl<CM: Deref, NS: Deref, K: Deref + Clone, TP: Deref> LSPS5ServiceHandler<CM, NS, K, TP>
@@ -167,6 +169,7 @@ where
 			node_signer,
 			kv_store,
 			last_pruning: Mutex::new(None),
+			persistence_in_flight: AtomicUsize::new(0),
 		}
 	}

@@ -245,63 +248,80 @@ where
 		// TODO: We should eventually persist in parallel, however, when we do, we probably want to
 		// introduce some batching to upper-bound the number of requests inflight at any given
 		// time.
-		let mut need_remove = Vec::new();
-		let mut need_persist = Vec::new();
-
-		self.check_prune_stale_webhooks(&mut self.per_peer_state.write().unwrap());
-		{
-			let outer_state_lock = self.per_peer_state.read().unwrap();
-
-			for (client_id, peer_state) in outer_state_lock.iter() {
-				let is_prunable = peer_state.is_prunable();
-				let has_open_channel = self.client_has_open_channel(client_id);
-				if is_prunable && !has_open_channel {
-					need_remove.push(*client_id);
-				} else if peer_state.needs_persist {
-					need_persist.push(*client_id);
-				}
-			}
-		}

-		for client_id in need_persist.into_iter() {
-			debug_assert!(!need_remove.contains(&client_id));
-			self.persist_peer_state(client_id).await?;
+		if self.persistence_in_flight.fetch_add(1, Ordering::AcqRel) > 0 {
+			// If we're not the first event processor to get here, just return early, the increment
+			// we just did will be treated as "go around again" at the end.
+			return Ok(());
 		}

-		for client_id in need_remove {
-			let mut future_opt = None;
+		loop {
+			let mut need_remove = Vec::new();
+			let mut need_persist = Vec::new();
+
+			self.check_prune_stale_webhooks(&mut self.per_peer_state.write().unwrap());
 			{
-				// We need to take the `per_peer_state` write lock to remove an entry, but also
-				// have to hold it until after the `remove` call returns (but not through
-				// future completion) to ensure that writes for the peer's state are
-				// well-ordered with other `persist_peer_state` calls even across the removal
-				// itself.
-				let mut per_peer_state = self.per_peer_state.write().unwrap();
-				if let Entry::Occupied(mut entry) = per_peer_state.entry(client_id) {
-					let state = entry.get_mut();
-					if state.is_prunable() && !self.client_has_open_channel(&client_id) {
-						entry.remove();
-						let key = client_id.to_string();
-						future_opt = Some(self.kv_store.remove(
-							LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
-							LSPS5_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
-							&key,
-						));
+				let outer_state_lock = self.per_peer_state.read().unwrap();
+
+				for (client_id, peer_state) in outer_state_lock.iter() {
+					let is_prunable = peer_state.is_prunable();
+					let has_open_channel = self.client_has_open_channel(client_id);
+					if is_prunable && !has_open_channel {
+						need_remove.push(*client_id);
+					} else if peer_state.needs_persist {
+						need_persist.push(*client_id);
+					}
+				}
+			}
+
+			for client_id in need_persist.into_iter() {
+				debug_assert!(!need_remove.contains(&client_id));
+				self.persist_peer_state(client_id).await?;
+			}
+
+			for client_id in need_remove {
+				let mut future_opt = None;
+				{
+					// We need to take the `per_peer_state` write lock to remove an entry, but also
+					// have to hold it until after the `remove` call returns (but not through
+					// future completion) to ensure that writes for the peer's state are
+					// well-ordered with other `persist_peer_state` calls even across the removal
+					// itself.
+					let mut per_peer_state = self.per_peer_state.write().unwrap();
+					if let Entry::Occupied(mut entry) = per_peer_state.entry(client_id) {
+						let state = entry.get_mut();
+						if state.is_prunable() && !self.client_has_open_channel(&client_id) {
+							entry.remove();
+							let key = client_id.to_string();
+							future_opt = Some(self.kv_store.remove(
+								LIQUIDITY_MANAGER_PERSISTENCE_PRIMARY_NAMESPACE,
+								LSPS5_SERVICE_PERSISTENCE_SECONDARY_NAMESPACE,
+								&key,
+							));
+						} else {
+							// If the peer was re-added, force a re-persist of the current state.
+							state.needs_persist = true;
+						}
 					} else {
-						// If the peer was re-added, force a re-persist of the current state.
-						state.needs_persist = true;
+						// This should never happen, we can only have one `persist` call
+						// in-progress at once and map entries are only removed by it.
+						debug_assert!(false);
 					}
+				}
+				if let Some(future) = future_opt {
+					future.await?;
 				} else {
-					// This should never happen, we can only have one `persist` call
-					// in-progress at once and map entries are only removed by it.
-					debug_assert!(false);
+					self.persist_peer_state(client_id).await?;
 				}
 			}
-			if let Some(future) = future_opt {
-				future.await?;
-			} else {
-				self.persist_peer_state(client_id).await?;
+
+			if self.persistence_in_flight.fetch_sub(1, Ordering::AcqRel) != 1 {
+				// If another thread incremented the state while we were running we should go
+				// around again, but only once.
+				self.persistence_in_flight.store(1, Ordering::Release);
+				continue;
 			}
+			break;
 		}

 		Ok(())
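Both removal paths also rely on creating the `KVStore::remove` future while the `per_peer_state` write lock is held, then awaiting it only after the lock is dropped. A rough sketch of that ordering pattern, using a hypothetical `KvStore` and `Service` rather than the real LDK types:

use std::collections::HashMap;
use std::future::Future;
use std::pin::Pin;
use std::sync::RwLock;

// Hypothetical async key-value store; only the shape of `remove` matters here.
struct KvStore;
impl KvStore {
	fn remove(&self, _key: &str) -> Pin<Box<dyn Future<Output = Result<(), ()>> + Send>> {
		Box::pin(async { Ok::<(), ()>(()) })
	}
}

struct Service {
	per_peer_state: RwLock<HashMap<String, u64>>,
	kv_store: KvStore,
}

impl Service {
	async fn remove_peer(&self, key: &str) -> Result<(), ()> {
		let removal_future;
		{
			// Create the removal future while holding the write lock so the store
			// sees this removal strictly ordered against other persistence calls
			// for the same peer, as the comments in the diffs above describe.
			let mut per_peer_state = self.per_peer_state.write().unwrap();
			per_peer_state.remove(key);
			removal_future = self.kv_store.remove(key);
		}
		// Await outside the lock so other tasks are not blocked while the
		// backing store completes the removal.
		removal_future.await
	}
}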
