From 30f408118213871503cdb7c3bbf229bc04a15a38 Mon Sep 17 00:00:00 2001 From: sadanand48 Date: Fri, 20 Mar 2026 14:40:12 +0530 Subject: [PATCH 01/11] HDDS-14868. Avoid full scan of container list during refreshAndValidate of ContainerSafemodeRule. --- .../AbstractContainerSafeModeRule.java | 84 ++++++++++++++++++- .../scm/safemode/ECContainerSafeModeRule.java | 24 ++++++ .../safemode/RatisContainerSafeModeRule.java | 17 ++++ 3 files changed, 121 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java index 09480009455d..c60375db28f4 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java @@ -50,6 +50,8 @@ public abstract class AbstractContainerSafeModeRule extends SafeModeExitRule openContainers = new ConcurrentHashMap<>(); + private final Map closedContainers = new ConcurrentHashMap<>(); public AbstractContainerSafeModeRule(ConfigurationSource conf, SCMSafeModeManager safeModeManager, ContainerManager containerManager, EventQueue eventQueue) { @@ -59,6 +61,18 @@ public AbstractContainerSafeModeRule(ConfigurationSource conf, SCMSafeModeManage initializeRule(); } + public ContainerManager getContainerManager() { + return containerManager; + } + + public Map getClosedContainers() { + return closedContainers; + } + + public Map getOpenContainers() { + return openContainers; + } + protected abstract ReplicationType getContainerType(); protected abstract void handleReportedContainer(ContainerID containerID, DatanodeID datanodeID); @@ -73,10 +87,20 @@ protected final void incrementContainersWithMinReplicas() { protected void initializeRule() { containers.clear(); + openContainers.clear(); + closedContainers.clear(); containerManager.getContainers(getContainerType()).stream() - .filter(this::isClosed) .filter(c -> c.getNumberOfKeys() > 0) - .forEach(c -> containers.put(c.containerID(), c.getReplicationConfig().getMinimumNodes())); + .forEach(c -> { + if (isClosed(c)) { + containers.put(c.containerID(), c.getReplicationConfig().getMinimumNodes()); + closedContainers.put(c.containerID(), c.containerID()); + } + if (isOpen(c)) { + openContainers.put(c.containerID(),c.containerID()); + } + } + ); totalContainers.set(containers.size()); final long cutOff = (long) Math.ceil(getTotalNumberOfContainers() * getSafeModeCutoff()); getSafeModeMetrics().setNumContainerReportedThreshold(getContainerType(), cutOff); @@ -91,6 +115,18 @@ protected int getTotalNumberOfContainers() { return totalContainers.get(); } + protected void addContainer(ContainerInfo containerInfo) { + if (containers.putIfAbsent(containerInfo.containerID(), containerInfo.getReplicationConfig().getMinimumNodes()) == null) { + totalContainers.getAndIncrement(); + } + } + + protected void removeContainer(ContainerInfo containerInfo) { + if (containers.remove(containerInfo.containerID()) != null) { + totalContainers.getAndDecrement(); + } + } + protected double getSafeModeCutoff() { return safeModeCutoff; } @@ -135,14 +171,42 @@ public double getCurrentContainerThreshold() { @Override public synchronized void refresh(boolean forceRefresh) { - if (forceRefresh || !validate()) { + if (forceRefresh) { initializeRule(); } + if (!validate()) { + // iterate through open containers and check if any of them have moved to closed state + for(ContainerID containerID : openContainers.keySet()) { + try { + ContainerInfo containerInfo = containerManager.getContainer(containerID); + if (isClosed(containerInfo)) { + addContainer(containerInfo); + openContainers.remove(containerID); + } } catch (ContainerNotFoundException e) { + // log + } + } + // iterate through closed containers and check if any of them have moved to deleted state + for(ContainerID containerID : closedContainers.keySet()) { + try { + ContainerInfo containerInfo = containerManager.getContainer(containerID); + if (isDeleted(containerInfo)) { + removeContainer(containerInfo); + closedContainers.remove(containerID); + }} catch (ContainerNotFoundException e) { + // log + } + } + } } + @Override protected void cleanup() { - getContainers().clear(); + if (containers != null) containers.clear(); + if (openContainers != null) openContainers.clear(); + if (closedContainers != null) closedContainers.clear(); + if (totalContainers != null) totalContainers.set(0); } /** @@ -167,6 +231,18 @@ protected boolean isClosed(ContainerInfo container) { return state == LifeCycleState.QUASI_CLOSED || state == LifeCycleState.CLOSED; } + protected boolean isOpen(ContainerInfo container) { + final LifeCycleState state = container.getState(); + // should we include CLOSING? + return state == LifeCycleState.OPEN; + } + + private boolean isDeleted(ContainerInfo container) { + final LifeCycleState state = container.getState(); + // should we include DELETING ? + return state == LifeCycleState.DELETED; + } + protected int getMinReplica(ContainerID id) { return containers.getOrDefault(id, 0); } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java index 9eec2e3df2a1..b80c5db68137 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -23,7 +23,9 @@ import org.apache.hadoop.hdds.protocol.DatanodeID; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; import org.apache.hadoop.hdds.server.events.EventQueue; /** @@ -59,6 +61,28 @@ protected void handleReportedContainer(ContainerID containerID, DatanodeID datan incrementContainersWithMinReplicas(); getSafeModeMetrics().incCurrentContainersWithECDataReplicaReportedCount(); } + } else { + // we received a container report that SCM was unaware of when it initialized + // check if the container state is closed/quasi-closed and if yes count it + try { + ContainerInfo containerInfo = getContainerManager().getContainer(containerID); + if (isClosed(containerInfo)) { + addContainer(containerInfo); + getClosedContainers().put(containerID,containerID); + getOpenContainers().remove(containerID); + final Map replicas = + ecContainerDNsMap.computeIfAbsent(containerID, key -> new ConcurrentHashMap<>()); + replicas.put(datanodeID, datanodeID); + + if (replicas.size() >= getMinReplica(containerID)) { + getContainers().remove(containerID); + incrementContainersWithMinReplicas(); + getSafeModeMetrics().incCurrentContainersWithECDataReplicaReportedCount(); + } + } + } catch (ContainerNotFoundException cnfe) { + // log it + } } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java index 61ade355eabd..d26d849bf68e 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java @@ -21,7 +21,9 @@ import org.apache.hadoop.hdds.protocol.DatanodeID; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; import org.apache.hadoop.hdds.server.events.EventQueue; import org.apache.ratis.util.Preconditions; @@ -52,6 +54,21 @@ protected void handleReportedContainer(ContainerID containerID, DatanodeID datan Preconditions.assertSame(1, minReplica, "minReplica"); incrementContainersWithMinReplicas(); getSafeModeMetrics().incCurrentContainersWithOneReplicaReportedCount(); + } else { + // we received a container report that SCM was unaware of when it initialized + // check if the container state is closed/quasi-closed and if yes count it + try { + ContainerInfo containerInfo = getContainerManager().getContainer(containerID); + if (isClosed(containerInfo)) { + addContainer(containerInfo); + incrementContainersWithMinReplicas(); + getClosedContainers().put(containerID, containerID); + getOpenContainers().remove(containerID); + getSafeModeMetrics().incCurrentContainersWithOneReplicaReportedCount(); + } + } catch (ContainerNotFoundException cnfe) { + // log it + } } } From 30290d614b40942dabb8dcf2446ca491a7a7cf80 Mon Sep 17 00:00:00 2001 From: sadanand48 Date: Mon, 23 Mar 2026 13:59:13 +0530 Subject: [PATCH 02/11] fix 1 --- .../AbstractContainerSafeModeRule.java | 20 +++++++++++++++++-- .../scm/safemode/ECContainerSafeModeRule.java | 10 +++++++--- .../safemode/RatisContainerSafeModeRule.java | 12 +++++++---- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java index c60375db28f4..8cf4ab6e9d55 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java @@ -52,6 +52,7 @@ public abstract class AbstractContainerSafeModeRule extends SafeModeExitRule openContainers = new ConcurrentHashMap<>(); private final Map closedContainers = new ConcurrentHashMap<>(); + private final Map processedContainers = new ConcurrentHashMap<>(); public AbstractContainerSafeModeRule(ConfigurationSource conf, SCMSafeModeManager safeModeManager, ContainerManager containerManager, EventQueue eventQueue) { @@ -73,6 +74,14 @@ public Map getOpenContainers() { return openContainers; } + public Map getProcessedContainers() { + return processedContainers; + } + + void incrementTotalContainers() { + totalContainers.getAndIncrement(); + } + protected abstract ReplicationType getContainerType(); protected abstract void handleReportedContainer(ContainerID containerID, DatanodeID datanodeID); @@ -89,6 +98,7 @@ protected void initializeRule() { containers.clear(); openContainers.clear(); closedContainers.clear(); + processedContainers.clear(); containerManager.getContainers(getContainerType()).stream() .filter(c -> c.getNumberOfKeys() > 0) .forEach(c -> { @@ -182,8 +192,11 @@ public synchronized void refresh(boolean forceRefresh) { if (isClosed(containerInfo)) { addContainer(containerInfo); openContainers.remove(containerID); + closedContainers.put(containerID, containerID); } } catch (ContainerNotFoundException e) { - // log + SCMSafeModeManager.getLogger().debug( + "Container {} not found while checking open-to-closed transition, may be transient", + containerID); } } // iterate through closed containers and check if any of them have moved to deleted state @@ -194,7 +207,9 @@ public synchronized void refresh(boolean forceRefresh) { removeContainer(containerInfo); closedContainers.remove(containerID); }} catch (ContainerNotFoundException e) { - // log + SCMSafeModeManager.getLogger().debug( + "Container {} not found while checking closed-to-deleted transition, may be transient", + containerID); } } } @@ -207,6 +222,7 @@ protected void cleanup() { if (openContainers != null) openContainers.clear(); if (closedContainers != null) closedContainers.clear(); if (totalContainers != null) totalContainers.set(0); + if (processedContainers != null) processedContainers.clear(); } /** diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java index b80c5db68137..b29953a53498 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -58,15 +58,16 @@ protected void handleReportedContainer(ContainerID containerID, DatanodeID datan if (replicas.size() >= getMinReplica(containerID)) { getContainers().remove(containerID); + getProcessedContainers().put(containerID, containerID); incrementContainersWithMinReplicas(); getSafeModeMetrics().incCurrentContainersWithECDataReplicaReportedCount(); } - } else { + } else if (!getProcessedContainers().containsKey(containerID)){ // we received a container report that SCM was unaware of when it initialized // check if the container state is closed/quasi-closed and if yes count it try { ContainerInfo containerInfo = getContainerManager().getContainer(containerID); - if (isClosed(containerInfo)) { + if (isClosed(containerInfo) && containerInfo.getNumberOfKeys() > 0) { addContainer(containerInfo); getClosedContainers().put(containerID,containerID); getOpenContainers().remove(containerID); @@ -78,10 +79,13 @@ protected void handleReportedContainer(ContainerID containerID, DatanodeID datan getContainers().remove(containerID); incrementContainersWithMinReplicas(); getSafeModeMetrics().incCurrentContainersWithECDataReplicaReportedCount(); + getProcessedContainers().put(containerID, containerID); } } } catch (ContainerNotFoundException cnfe) { - // log it + SCMSafeModeManager.getLogger().debug( + "Container {} not found in ContainerManager : {}", + containerID, cnfe.getMessage()); } } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java index d26d849bf68e..7d56bd1e6098 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java @@ -54,20 +54,24 @@ protected void handleReportedContainer(ContainerID containerID, DatanodeID datan Preconditions.assertSame(1, minReplica, "minReplica"); incrementContainersWithMinReplicas(); getSafeModeMetrics().incCurrentContainersWithOneReplicaReportedCount(); - } else { + getProcessedContainers().put(containerID, containerID); + } else if (!getProcessedContainers().containsKey(containerID)) { // we received a container report that SCM was unaware of when it initialized // check if the container state is closed/quasi-closed and if yes count it try { ContainerInfo containerInfo = getContainerManager().getContainer(containerID); - if (isClosed(containerInfo)) { - addContainer(containerInfo); + if (isClosed(containerInfo) && containerInfo.getNumberOfKeys() > 0) { + incrementTotalContainers(); incrementContainersWithMinReplicas(); + getProcessedContainers().put(containerID, containerID); getClosedContainers().put(containerID, containerID); getOpenContainers().remove(containerID); getSafeModeMetrics().incCurrentContainersWithOneReplicaReportedCount(); } } catch (ContainerNotFoundException cnfe) { - // log it + SCMSafeModeManager.getLogger().debug( + "Container {} not found in ContainerManager : {}", + containerID, cnfe.getMessage()); } } } From 2a9aaddad25ba31890c1e2d9f54896e9fdec0107 Mon Sep 17 00:00:00 2001 From: sadanand48 Date: Fri, 27 Mar 2026 13:22:43 +0530 Subject: [PATCH 03/11] refresh every 5s --- .../apache/hadoop/hdds/HddsConfigKeys.java | 9 ++ .../AbstractContainerSafeModeRule.java | 95 ++++++++++++++++++- .../hdds/scm/safemode/SCMSafeModeManager.java | 7 ++ .../AbstractContainerSafeModeRuleTest.java | 38 +++++++- .../scm/safemode/TestSCMSafeModeManager.java | 1 + .../scm/safemode/TestSafeModeRuleFactory.java | 5 +- 6 files changed, 148 insertions(+), 7 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java index 0473d6da36ab..3dd52deb1450 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java @@ -116,6 +116,15 @@ public final class HddsConfigKeys { "hdds.scm.safemode.log.interval"; public static final String HDDS_SCM_SAFEMODE_LOG_INTERVAL_DEFAULT = "1m"; + /** + * Interval for background sync of open/closed container sets with ContainerManager + * in {@code AbstractContainerSafeModeRule}. 0 disables the background thread. + */ + public static final String HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL = + "hdds.scm.safemode.container.rule.incremental.sync.interval"; + public static final String + HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL_DEFAULT = "5s"; + // This configuration setting is used as a fallback location by all // Ozone/HDDS services for their metadata. It is useful as a single // config point for test/PoC clusters. diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java index 8cf4ab6e9d55..f42d4944ff3c 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java @@ -17,14 +17,20 @@ package org.apache.hadoop.hdds.scm.safemode; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL_DEFAULT; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import com.google.common.util.concurrent.ThreadFactoryBuilder; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import org.apache.hadoop.hdds.conf.ConfigurationSource; @@ -54,12 +60,19 @@ public abstract class AbstractContainerSafeModeRule extends SafeModeExitRule closedContainers = new ConcurrentHashMap<>(); private final Map processedContainers = new ConcurrentHashMap<>(); + private final long refreshInterval; + private volatile ScheduledExecutorService refreshExecutor; public AbstractContainerSafeModeRule(ConfigurationSource conf, SCMSafeModeManager safeModeManager, ContainerManager containerManager, EventQueue eventQueue) { super(safeModeManager, eventQueue); this.containerManager = containerManager; this.safeModeCutoff = getSafeModeCutoff(conf); + this.refreshInterval = conf.getTimeDuration( + HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL, + HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL_DEFAULT, + TimeUnit.MILLISECONDS); initializeRule(); + startRefreshExecutor(); } public ContainerManager getContainerManager() { @@ -179,11 +192,53 @@ public double getCurrentContainerThreshold() { return total == 0 ? 1 : ((double) getNumberOfContainersWithMinReplica() / total); } + + + private void startRefreshExecutor() { + if (refreshInterval <= 0) { + SCMSafeModeManager.getLogger().info( + "Container safe mode rule incremental sync is disabled ({}=0).", + HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL); + return; + } + refreshExecutor = Executors.newSingleThreadScheduledExecutor( + new ThreadFactoryBuilder() + .setDaemon(true) + .setNameFormat( + "ContainerSafeModeRule-" + getContainerType() + "-refresh-%d") + .build()); + refreshExecutor.scheduleAtFixedRate( + this::runRefresh, + refreshInterval, + refreshInterval, + TimeUnit.MILLISECONDS); + } + + /** + * Background task: reconcile open/closed container tracking with + * {@link ContainerManager} while SCM is in safe mode. + */ + private void runRefresh() { + synchronized (this) { + if (!scmInSafeMode()) { + return; + } + refreshExpectedContainers(); + } + } + @Override public synchronized void refresh(boolean forceRefresh) { if (forceRefresh) { initializeRule(); } + } + + /** + * Aligns tracked open/closed sets with current {@link ContainerManager} state. + * Runs on the incremental sync thread when configured. + */ + private void refreshExpectedContainers() { if (!validate()) { // iterate through open containers and check if any of them have moved to closed state for(ContainerID containerID : openContainers.keySet()) { @@ -218,11 +273,41 @@ public synchronized void refresh(boolean forceRefresh) { @Override protected void cleanup() { - if (containers != null) containers.clear(); - if (openContainers != null) openContainers.clear(); - if (closedContainers != null) closedContainers.clear(); - if (totalContainers != null) totalContainers.set(0); - if (processedContainers != null) processedContainers.clear(); + stopIncrementalSyncExecutor(); + synchronized (this) { + if (containers != null) { + containers.clear(); + } + if (openContainers != null) { + openContainers.clear(); + } + if (closedContainers != null) { + closedContainers.clear(); + } + if (totalContainers != null) { + totalContainers.set(0); + } + if (processedContainers != null) { + processedContainers.clear(); + } + } + } + + private void stopIncrementalSyncExecutor() { + if (refreshExecutor != null) { + refreshExecutor.shutdownNow(); + try { + refreshExecutor.awaitTermination(5, TimeUnit.SECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + refreshExecutor = null; + } + } + + @VisibleForTesting + void runIncrementalContainerSyncForTesting() { + runRefresh(); } /** diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java index 2c9173b2bf09..324944c3e99e 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java @@ -183,6 +183,13 @@ public synchronized void validateSafeModeExitRules(String ruleName) { public void forceExitSafeMode() { LOG.info("SCM force-exiting safe mode."); status.set(SafeModeStatus.OUT_OF_SAFE_MODE); + exitRules.values().forEach(rule -> { + try { + rule.cleanup(); + } catch (Exception e) { + LOG.warn("Safe mode exit rule cleanup failed for {}", rule.getRuleName(), e); + } + }); emitSafeModeStatus(); } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java index 7bfdecc71964..93cc17aa2fd9 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java @@ -17,6 +17,8 @@ package org.apache.hadoop.hdds.scm.safemode; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL_DEFAULT; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -27,6 +29,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.concurrent.TimeUnit; import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.DatanodeID; @@ -51,16 +54,21 @@ public abstract class AbstractContainerSafeModeRuleTest { private List containers; private AbstractContainerSafeModeRule rule; + private SCMSafeModeManager safeModeManager; @BeforeEach public void setup() throws ContainerNotFoundException { final ContainerManager containerManager = mock(ContainerManager.class); final ConfigurationSource conf = mock(ConfigurationSource.class); final EventQueue eventQueue = mock(EventQueue.class); - final SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class); + safeModeManager = mock(SCMSafeModeManager.class); final SafeModeMetrics metrics = mock(SafeModeMetrics.class); when(safeModeManager.getSafeModeMetrics()).thenReturn(metrics); + when(conf.getTimeDuration( + HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL, + HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL_DEFAULT, + TimeUnit.MILLISECONDS)).thenReturn(0L); containers = new ArrayList<>(); when(containerManager.getContainers(getReplicationType())).thenReturn(containers); when(containerManager.getContainer(any(ContainerID.class))).thenAnswer(invocation -> { @@ -170,6 +178,34 @@ public void testDuplicateContainerIdsInReports() { assertEquals(1.0, rule.getCurrentContainerThreshold(), "Duplicated containers should be counted only once"); } + /** + * When {@code validate()} is false, background refresh reconciles tracked open + * containers with {@link ContainerManager}: an OPEN container that becomes CLOSED is + * moved into closed tracking. + */ + @Test + public void testIncrementalRefreshPromotesOpenContainerToClosedInTracking() { + when(safeModeManager.getInSafeMode()).thenReturn(true); + + ContainerInfo closedMissingReplicas = mockContainer(LifeCycleState.CLOSED, 1L); + ContainerInfo openThenClosed = mockContainer(LifeCycleState.OPEN, 2L); + containers.add(closedMissingReplicas); + containers.add(openThenClosed); + rule.refresh(true); + + ContainerID id2 = ContainerID.valueOf(2L); + assertTrue(rule.getOpenContainers().containsKey(id2), + "OPEN container should be tracked in openContainers after initializeRule"); + + when(openThenClosed.getState()).thenReturn(LifeCycleState.CLOSED); + + rule.runIncrementalContainerSyncForTesting(); + + assertFalse(rule.getOpenContainers().containsKey(id2)); + assertTrue(rule.getClosedContainers().containsKey(id2)); + assertTrue(rule.getContainers().containsKey(id2)); + } + @Test public void testValidateBasedOnReportProcessingTrue() { rule.setValidateBasedOnReportProcessing(true); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java index fdf38a7a67ca..05bc7396402a 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java @@ -108,6 +108,7 @@ public void setUp() throws IOException { false); config.set(HddsConfigKeys.OZONE_METADATA_DIRS, tempDir.getAbsolutePath()); config.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, 1); + config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL, "0s"); scmMetadataStore = new SCMMetadataStoreImpl(config); } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java index f795a6c57628..5f3dffa2ce69 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java @@ -23,6 +23,7 @@ import static org.mockito.Mockito.when; import java.lang.reflect.Field; +import org.apache.hadoop.hdds.HddsConfigKeys; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.scm.container.ContainerManager; import org.apache.hadoop.hdds.scm.ha.SCMContext; @@ -79,7 +80,9 @@ public void testLoadedPreCheckRules() { private SCMSafeModeManager initializeSafeModeRuleFactory() { final SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class); when(safeModeManager.getSafeModeMetrics()).thenReturn(mock(SafeModeMetrics.class)); - SafeModeRuleFactory.initialize(new OzoneConfiguration(), + OzoneConfiguration conf = new OzoneConfiguration(); + conf.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL, "0s"); + SafeModeRuleFactory.initialize(conf, SCMContext.emptyContext(), new EventQueue(), mock( PipelineManager.class), mock(ContainerManager.class), mock(NodeManager.class)); From d74022e3f529eecf2cc87e8e9ce78148fff6110e Mon Sep 17 00:00:00 2001 From: sadanand48 Date: Mon, 30 Mar 2026 14:48:27 +0530 Subject: [PATCH 04/11] refresh only when transactions pending --- .../AbstractContainerSafeModeRule.java | 3 ++ .../hdds/scm/safemode/SCMSafeModeManager.java | 35 +++++++++++++++++++ .../hdds/scm/safemode/SafeModeExitRule.java | 4 +++ 3 files changed, 42 insertions(+) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java index f42d4944ff3c..aa7cd94c160b 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java @@ -239,6 +239,9 @@ public synchronized void refresh(boolean forceRefresh) { * Runs on the incremental sync thread when configured. */ private void refreshExpectedContainers() { + if (getSafeModeManager().isScmRatisApplyCaughtUpToCommit()) { + return; + } if (!validate()) { // iterate through open containers and check if any of them have moved to closed state for(ContainerID containerID : openContainers.keySet()) { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java index 324944c3e99e..4f1264bb0fdf 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java @@ -34,11 +34,15 @@ import org.apache.hadoop.hdds.HddsConfigKeys; import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.ha.SCMHAManager; +import org.apache.hadoop.hdds.scm.ha.SCMRatisServer; import org.apache.hadoop.hdds.scm.ha.SCMContext; import org.apache.hadoop.hdds.scm.ha.SCMService.Event; import org.apache.hadoop.hdds.scm.ha.SCMServiceManager; import org.apache.hadoop.hdds.scm.node.NodeManager; import org.apache.hadoop.hdds.scm.pipeline.PipelineManager; +import org.apache.hadoop.hdds.scm.server.OzoneStorageContainerManager; +import org.apache.hadoop.hdds.scm.server.StorageContainerManager; import org.apache.hadoop.hdds.server.events.EventQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -133,6 +137,37 @@ public SafeModeMetrics getSafeModeMetrics() { return safeModeMetrics; } + /** + * Returns true when the SCM Ratis state machine has applied all committed log entries + * ({@code lastAppliedIndex >= lastCommittedIndex}). In that case incremental + * {@link AbstractContainerSafeModeRule} refresh may skip work. + *

+ * Returns false if SCM is not {@link StorageContainerManager}, HA/Ratis is unavailable, + * or the comparison cannot be made. + */ + public boolean isScmRatisApplyCaughtUpToCommit() { + try { + OzoneStorageContainerManager ozoneScm = scmContext.getScm(); + if (!(ozoneScm instanceof StorageContainerManager)) { + return false; + } + SCMHAManager ha = ((StorageContainerManager) ozoneScm).getScmHAManager(); + if (ha == null) { + return false; + } + SCMRatisServer ratis = ha.getRatisServer(); + if (ratis == null) { + return false; + } + long applied = ratis.getSCMStateMachine().getLastAppliedTermIndex().getIndex(); + long committed = ratis.getDivision().getRaftLog().getLastCommittedIndex(); + return applied >= committed; + } catch (Exception e) { + LOG.debug("Could not compare Ratis last applied vs last committed index", e); + return false; + } + } + private void emitSafeModeStatus() { final SafeModeStatus safeModeStatus = status.get(); safeModeMetrics.setScmInSafeMode(safeModeStatus.isInSafeMode()); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java index 8535fbdf15a9..e28ca90ffe1a 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java @@ -68,6 +68,10 @@ public String getRuleName() { return ruleName; } + protected final SCMSafeModeManager getSafeModeManager() { + return safeModeManager; + } + /** * Return's the event type this safeMode exit rule handles. * @return TypedEvent From 66163f153929031afc2214d4f508c54f6013c0ae Mon Sep 17 00:00:00 2001 From: sadanand48 Date: Mon, 30 Mar 2026 14:55:35 +0530 Subject: [PATCH 05/11] revert new map additions --- .../AbstractContainerSafeModeRule.java | 93 +------------------ .../scm/safemode/ECContainerSafeModeRule.java | 28 ------ .../safemode/RatisContainerSafeModeRule.java | 21 ----- 3 files changed, 2 insertions(+), 140 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java index aa7cd94c160b..98bf77248774 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java @@ -56,9 +56,6 @@ public abstract class AbstractContainerSafeModeRule extends SafeModeExitRule openContainers = new ConcurrentHashMap<>(); - private final Map closedContainers = new ConcurrentHashMap<>(); - private final Map processedContainers = new ConcurrentHashMap<>(); private final long refreshInterval; private volatile ScheduledExecutorService refreshExecutor; @@ -79,18 +76,6 @@ public ContainerManager getContainerManager() { return containerManager; } - public Map getClosedContainers() { - return closedContainers; - } - - public Map getOpenContainers() { - return openContainers; - } - - public Map getProcessedContainers() { - return processedContainers; - } - void incrementTotalContainers() { totalContainers.getAndIncrement(); } @@ -109,21 +94,9 @@ protected final void incrementContainersWithMinReplicas() { protected void initializeRule() { containers.clear(); - openContainers.clear(); - closedContainers.clear(); - processedContainers.clear(); containerManager.getContainers(getContainerType()).stream() .filter(c -> c.getNumberOfKeys() > 0) - .forEach(c -> { - if (isClosed(c)) { - containers.put(c.containerID(), c.getReplicationConfig().getMinimumNodes()); - closedContainers.put(c.containerID(), c.containerID()); - } - if (isOpen(c)) { - openContainers.put(c.containerID(),c.containerID()); - } - } - ); + .forEach(c -> containers.put(c.containerID(), c.getReplicationConfig().getMinimumNodes())); totalContainers.set(containers.size()); final long cutOff = (long) Math.ceil(getTotalNumberOfContainers() * getSafeModeCutoff()); getSafeModeMetrics().setNumContainerReportedThreshold(getContainerType(), cutOff); @@ -138,18 +111,6 @@ protected int getTotalNumberOfContainers() { return totalContainers.get(); } - protected void addContainer(ContainerInfo containerInfo) { - if (containers.putIfAbsent(containerInfo.containerID(), containerInfo.getReplicationConfig().getMinimumNodes()) == null) { - totalContainers.getAndIncrement(); - } - } - - protected void removeContainer(ContainerInfo containerInfo) { - if (containers.remove(containerInfo.containerID()) != null) { - totalContainers.getAndDecrement(); - } - } - protected double getSafeModeCutoff() { return safeModeCutoff; } @@ -243,33 +204,7 @@ private void refreshExpectedContainers() { return; } if (!validate()) { - // iterate through open containers and check if any of them have moved to closed state - for(ContainerID containerID : openContainers.keySet()) { - try { - ContainerInfo containerInfo = containerManager.getContainer(containerID); - if (isClosed(containerInfo)) { - addContainer(containerInfo); - openContainers.remove(containerID); - closedContainers.put(containerID, containerID); - } } catch (ContainerNotFoundException e) { - SCMSafeModeManager.getLogger().debug( - "Container {} not found while checking open-to-closed transition, may be transient", - containerID); - } - } - // iterate through closed containers and check if any of them have moved to deleted state - for(ContainerID containerID : closedContainers.keySet()) { - try { - ContainerInfo containerInfo = containerManager.getContainer(containerID); - if (isDeleted(containerInfo)) { - removeContainer(containerInfo); - closedContainers.remove(containerID); - }} catch (ContainerNotFoundException e) { - SCMSafeModeManager.getLogger().debug( - "Container {} not found while checking closed-to-deleted transition, may be transient", - containerID); - } - } + initializeRule(); } } @@ -281,18 +216,6 @@ protected void cleanup() { if (containers != null) { containers.clear(); } - if (openContainers != null) { - openContainers.clear(); - } - if (closedContainers != null) { - closedContainers.clear(); - } - if (totalContainers != null) { - totalContainers.set(0); - } - if (processedContainers != null) { - processedContainers.clear(); - } } } @@ -335,18 +258,6 @@ protected boolean isClosed(ContainerInfo container) { return state == LifeCycleState.QUASI_CLOSED || state == LifeCycleState.CLOSED; } - protected boolean isOpen(ContainerInfo container) { - final LifeCycleState state = container.getState(); - // should we include CLOSING? - return state == LifeCycleState.OPEN; - } - - private boolean isDeleted(ContainerInfo container) { - final LifeCycleState state = container.getState(); - // should we include DELETING ? - return state == LifeCycleState.DELETED; - } - protected int getMinReplica(ContainerID id) { return containers.getOrDefault(id, 0); } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java index b29953a53498..9eec2e3df2a1 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -23,9 +23,7 @@ import org.apache.hadoop.hdds.protocol.DatanodeID; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; import org.apache.hadoop.hdds.scm.container.ContainerID; -import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerManager; -import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; import org.apache.hadoop.hdds.server.events.EventQueue; /** @@ -58,35 +56,9 @@ protected void handleReportedContainer(ContainerID containerID, DatanodeID datan if (replicas.size() >= getMinReplica(containerID)) { getContainers().remove(containerID); - getProcessedContainers().put(containerID, containerID); incrementContainersWithMinReplicas(); getSafeModeMetrics().incCurrentContainersWithECDataReplicaReportedCount(); } - } else if (!getProcessedContainers().containsKey(containerID)){ - // we received a container report that SCM was unaware of when it initialized - // check if the container state is closed/quasi-closed and if yes count it - try { - ContainerInfo containerInfo = getContainerManager().getContainer(containerID); - if (isClosed(containerInfo) && containerInfo.getNumberOfKeys() > 0) { - addContainer(containerInfo); - getClosedContainers().put(containerID,containerID); - getOpenContainers().remove(containerID); - final Map replicas = - ecContainerDNsMap.computeIfAbsent(containerID, key -> new ConcurrentHashMap<>()); - replicas.put(datanodeID, datanodeID); - - if (replicas.size() >= getMinReplica(containerID)) { - getContainers().remove(containerID); - incrementContainersWithMinReplicas(); - getSafeModeMetrics().incCurrentContainersWithECDataReplicaReportedCount(); - getProcessedContainers().put(containerID, containerID); - } - } - } catch (ContainerNotFoundException cnfe) { - SCMSafeModeManager.getLogger().debug( - "Container {} not found in ContainerManager : {}", - containerID, cnfe.getMessage()); - } } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java index 7d56bd1e6098..61ade355eabd 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java @@ -21,9 +21,7 @@ import org.apache.hadoop.hdds.protocol.DatanodeID; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; import org.apache.hadoop.hdds.scm.container.ContainerID; -import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerManager; -import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; import org.apache.hadoop.hdds.server.events.EventQueue; import org.apache.ratis.util.Preconditions; @@ -54,25 +52,6 @@ protected void handleReportedContainer(ContainerID containerID, DatanodeID datan Preconditions.assertSame(1, minReplica, "minReplica"); incrementContainersWithMinReplicas(); getSafeModeMetrics().incCurrentContainersWithOneReplicaReportedCount(); - getProcessedContainers().put(containerID, containerID); - } else if (!getProcessedContainers().containsKey(containerID)) { - // we received a container report that SCM was unaware of when it initialized - // check if the container state is closed/quasi-closed and if yes count it - try { - ContainerInfo containerInfo = getContainerManager().getContainer(containerID); - if (isClosed(containerInfo) && containerInfo.getNumberOfKeys() > 0) { - incrementTotalContainers(); - incrementContainersWithMinReplicas(); - getProcessedContainers().put(containerID, containerID); - getClosedContainers().put(containerID, containerID); - getOpenContainers().remove(containerID); - getSafeModeMetrics().incCurrentContainersWithOneReplicaReportedCount(); - } - } catch (ContainerNotFoundException cnfe) { - SCMSafeModeManager.getLogger().debug( - "Container {} not found in ContainerManager : {}", - containerID, cnfe.getMessage()); - } } } From efdefc94ea9e113b0f0a548a41afc57064ee0fed Mon Sep 17 00:00:00 2001 From: sadanand48 Date: Mon, 30 Mar 2026 14:58:29 +0530 Subject: [PATCH 06/11] compile --- .../AbstractContainerSafeModeRuleTest.java | 28 ------------------- 1 file changed, 28 deletions(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java index 93cc17aa2fd9..75d2f67708a0 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java @@ -178,34 +178,6 @@ public void testDuplicateContainerIdsInReports() { assertEquals(1.0, rule.getCurrentContainerThreshold(), "Duplicated containers should be counted only once"); } - /** - * When {@code validate()} is false, background refresh reconciles tracked open - * containers with {@link ContainerManager}: an OPEN container that becomes CLOSED is - * moved into closed tracking. - */ - @Test - public void testIncrementalRefreshPromotesOpenContainerToClosedInTracking() { - when(safeModeManager.getInSafeMode()).thenReturn(true); - - ContainerInfo closedMissingReplicas = mockContainer(LifeCycleState.CLOSED, 1L); - ContainerInfo openThenClosed = mockContainer(LifeCycleState.OPEN, 2L); - containers.add(closedMissingReplicas); - containers.add(openThenClosed); - rule.refresh(true); - - ContainerID id2 = ContainerID.valueOf(2L); - assertTrue(rule.getOpenContainers().containsKey(id2), - "OPEN container should be tracked in openContainers after initializeRule"); - - when(openThenClosed.getState()).thenReturn(LifeCycleState.CLOSED); - - rule.runIncrementalContainerSyncForTesting(); - - assertFalse(rule.getOpenContainers().containsKey(id2)); - assertTrue(rule.getClosedContainers().containsKey(id2)); - assertTrue(rule.getContainers().containsKey(id2)); - } - @Test public void testValidateBasedOnReportProcessingTrue() { rule.setValidateBasedOnReportProcessing(true); From 34603b87fcdf0b117a4a2c70cdd993963fdacd28 Mon Sep 17 00:00:00 2001 From: sadanand48 Date: Tue, 31 Mar 2026 18:23:24 +0530 Subject: [PATCH 07/11] address comments --- .../apache/hadoop/hdds/HddsConfigKeys.java | 4 +- .../AbstractContainerSafeModeRule.java | 77 +------------------ .../hdds/scm/safemode/SCMSafeModeManager.java | 37 +++++++++ .../AbstractContainerSafeModeRuleTest.java | 8 +- .../scm/safemode/TestSCMSafeModeManager.java | 2 +- .../scm/safemode/TestSafeModeRuleFactory.java | 2 +- 6 files changed, 47 insertions(+), 83 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java index 3dd52deb1450..a6fabbe29186 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java @@ -120,10 +120,10 @@ public final class HddsConfigKeys { * Interval for background sync of open/closed container sets with ContainerManager * in {@code AbstractContainerSafeModeRule}. 0 disables the background thread. */ - public static final String HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL = + public static final String HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL = "hdds.scm.safemode.container.rule.incremental.sync.interval"; public static final String - HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL_DEFAULT = "5s"; + HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT = "5s"; // This configuration setting is used as a fallback location by all // Ozone/HDDS services for their metadata. It is useful as a single diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java index 98bf77248774..759d6beebd85 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java @@ -17,20 +17,14 @@ package org.apache.hadoop.hdds.scm.safemode; -import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL; -import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL_DEFAULT; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; -import com.google.common.util.concurrent.ThreadFactoryBuilder; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import org.apache.hadoop.hdds.conf.ConfigurationSource; @@ -56,28 +50,12 @@ public abstract class AbstractContainerSafeModeRule extends SafeModeExitRule safeModeLogTask; + private final long refreshIntervalMs; public SCMSafeModeManager(final ConfigurationSource conf, final NodeManager nodeManager, @@ -121,6 +124,31 @@ public SCMSafeModeManager(final ConfigurationSource conf, status.set(SafeModeStatus.OUT_OF_SAFE_MODE); emitSafeModeStatus(); } + + this.refreshIntervalMs = conf.getTimeDuration( + HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, + HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT, + TimeUnit.MILLISECONDS); + startRefreshExecutor(refreshIntervalMs); + } + + private void startRefreshExecutor(long refreshIntervalMs) { + final boolean enabled = refreshIntervalMs > 0; + LOG.info("Container safe mode rule refresh: enabled? {}, {}={}ms", + enabled, HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, refreshIntervalMs); + if (!enabled) { + return; + } + final ScheduledExecutorService refreshExecutor = Executors.newSingleThreadScheduledExecutor( + new ThreadFactoryBuilder() + .setDaemon(true) + .setNameFormat(getClass().getSimpleName() + "-refresh-%d") + .build()); + refreshExecutor.scheduleAtFixedRate( + () -> refreshAndValidate(refreshExecutor), + refreshIntervalMs, + refreshIntervalMs, + TimeUnit.MILLISECONDS); } public void start() { @@ -246,6 +274,13 @@ public void refresh() { * Refresh Rule state and validate rules. */ public void refreshAndValidate() { + if (refreshIntervalMs > 0) { + return; // use executor to refresh + } + refreshAndValidate(null); + } + + private void refreshAndValidate(ScheduledExecutorService refreshExecutor) { if (getInSafeMode()) { exitRules.values().forEach(rule -> { rule.refresh(false); @@ -254,6 +289,8 @@ public void refreshAndValidate() { rule.cleanup(); } }); + } else if (refreshExecutor != null) { + refreshExecutor.shutdownNow(); // Not in safemode } } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java index 75d2f67708a0..70ecf075d186 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java @@ -17,8 +17,8 @@ package org.apache.hadoop.hdds.scm.safemode; -import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL; -import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL_DEFAULT; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -66,8 +66,8 @@ public void setup() throws ContainerNotFoundException { when(safeModeManager.getSafeModeMetrics()).thenReturn(metrics); when(conf.getTimeDuration( - HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL, - HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL_DEFAULT, + HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, + HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT, TimeUnit.MILLISECONDS)).thenReturn(0L); containers = new ArrayList<>(); when(containerManager.getContainers(getReplicationType())).thenReturn(containers); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java index 05bc7396402a..e4bed61f1bd2 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java @@ -108,7 +108,7 @@ public void setUp() throws IOException { false); config.set(HddsConfigKeys.OZONE_METADATA_DIRS, tempDir.getAbsolutePath()); config.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, 1); - config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL, "0s"); + config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s"); scmMetadataStore = new SCMMetadataStoreImpl(config); } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java index 5f3dffa2ce69..cdafe912c9f0 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java @@ -81,7 +81,7 @@ private SCMSafeModeManager initializeSafeModeRuleFactory() { final SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class); when(safeModeManager.getSafeModeMetrics()).thenReturn(mock(SafeModeMetrics.class)); OzoneConfiguration conf = new OzoneConfiguration(); - conf.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_CONTAINER_RULE_REFRESH_INTERVAL, "0s"); + conf.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s"); SafeModeRuleFactory.initialize(conf, SCMContext.emptyContext(), new EventQueue(), mock( PipelineManager.class), From e2bfca46019ec155f71d26a1ec03902012c363ab Mon Sep 17 00:00:00 2001 From: sadanand48 Date: Tue, 31 Mar 2026 18:29:12 +0530 Subject: [PATCH 08/11] code cleanup --- .../java/org/apache/hadoop/hdds/HddsConfigKeys.java | 3 +-- .../scm/safemode/AbstractContainerSafeModeRule.java | 10 +++------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java index a6fabbe29186..9b6eb809d565 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java @@ -117,8 +117,7 @@ public final class HddsConfigKeys { public static final String HDDS_SCM_SAFEMODE_LOG_INTERVAL_DEFAULT = "1m"; /** - * Interval for background sync of open/closed container sets with ContainerManager - * in {@code AbstractContainerSafeModeRule}. 0 disables the background thread. + * Interval for background refresh of safeMode rules. 0 disables the background thread. */ public static final String HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL = "hdds.scm.safemode.container.rule.incremental.sync.interval"; diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java index 759d6beebd85..997bc35f2460 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java @@ -141,8 +141,8 @@ public synchronized void refresh(boolean forceRefresh) { } /** - * Aligns tracked open/closed sets with current {@link ContainerManager} state. - * Runs on the incremental sync thread when configured. + * Refreshes the expected container list when the rule is + * not yet validated and when there are pending transactions to be applied. */ private void refreshExpectedContainers() { if (getSafeModeManager().isScmRatisApplyCaughtUpToCommit()) { @@ -156,11 +156,7 @@ private void refreshExpectedContainers() { @Override protected void cleanup() { - synchronized (this) { - if (containers != null) { - containers.clear(); - } - } + getContainers().clear(); } /** From cafe82ae004329c2211a557712f18e0c620743ee Mon Sep 17 00:00:00 2001 From: sadanand48 Date: Wed, 1 Apr 2026 12:03:16 +0530 Subject: [PATCH 09/11] checkstyle --- .../safemode/AbstractContainerSafeModeRule.java | 2 +- .../hdds/scm/safemode/SCMSafeModeManager.java | 16 ++++++++-------- .../AbstractContainerSafeModeRuleTest.java | 3 +-- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java index 997bc35f2460..157b1e119c46 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java @@ -50,6 +50,7 @@ public abstract class AbstractContainerSafeModeRule extends SafeModeExitRule 0; + private void startRefreshExecutor(long refreshIntervalMillis) { + final boolean enabled = refreshIntervalMillis > 0; LOG.info("Container safe mode rule refresh: enabled? {}, {}={}ms", - enabled, HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, refreshIntervalMs); + enabled, HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, refreshIntervalMillis); if (!enabled) { return; } @@ -146,8 +146,8 @@ private void startRefreshExecutor(long refreshIntervalMs) { .build()); refreshExecutor.scheduleAtFixedRate( () -> refreshAndValidate(refreshExecutor), - refreshIntervalMs, - refreshIntervalMs, + refreshIntervalMillis, + refreshIntervalMillis, TimeUnit.MILLISECONDS); } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java index 70ecf075d186..7e25c0d187e3 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java @@ -54,14 +54,13 @@ public abstract class AbstractContainerSafeModeRuleTest { private List containers; private AbstractContainerSafeModeRule rule; - private SCMSafeModeManager safeModeManager; @BeforeEach public void setup() throws ContainerNotFoundException { final ContainerManager containerManager = mock(ContainerManager.class); final ConfigurationSource conf = mock(ConfigurationSource.class); final EventQueue eventQueue = mock(EventQueue.class); - safeModeManager = mock(SCMSafeModeManager.class); + SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class); final SafeModeMetrics metrics = mock(SafeModeMetrics.class); when(safeModeManager.getSafeModeMetrics()).thenReturn(metrics); From 7e9a8f78673fc53e0e5e8723da040f6036568423 Mon Sep 17 00:00:00 2001 From: sadanand48 Date: Wed, 1 Apr 2026 22:32:10 +0530 Subject: [PATCH 10/11] add to default xml --- .../main/java/org/apache/hadoop/hdds/HddsConfigKeys.java | 2 +- hadoop-hdds/common/src/main/resources/ozone-default.xml | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java index 9b6eb809d565..d71af31768c1 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java @@ -120,7 +120,7 @@ public final class HddsConfigKeys { * Interval for background refresh of safeMode rules. 0 disables the background thread. */ public static final String HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL = - "hdds.scm.safemode.container.rule.incremental.sync.interval"; + "hdds.scm.safemode.rule.refresh.interval"; public static final String HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT = "5s"; diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml index d63dedb15416..4afc795a6d64 100644 --- a/hadoop-hdds/common/src/main/resources/ozone-default.xml +++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml @@ -1701,6 +1701,13 @@ reported replica before SCM comes out of safe mode. + + hdds.scm.safemode.rule.refresh.interval + 5s + HDDS,SCM,OPERATION + Refresh interval in SCM Safemode. + + hdds.scm.wait.time.after.safemode.exit From c3fb5a5fae29945a9e27ee634806d501ff655458 Mon Sep 17 00:00:00 2001 From: sadanand48 Date: Wed, 1 Apr 2026 23:49:39 +0530 Subject: [PATCH 11/11] add tests --- .../hadoop/hdds/scm/ha/SCMStateMachine.java | 6 -- .../AbstractContainerSafeModeRule.java | 20 +----- .../hdds/scm/safemode/SCMSafeModeManager.java | 35 ----------- .../scm/safemode/TestSCMSafeModeManager.java | 61 +++++++++++++++++++ .../hadoop/ozone/om/TestScmSafeMode.java | 24 ++++++++ 5 files changed, 88 insertions(+), 58 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java index d702bb2a5d46..627e9a296c62 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java @@ -161,12 +161,6 @@ public CompletableFuture applyTransaction( // Ratis client, leaving SCM intact. applyTransactionFuture.completeExceptionally(ex); } - - // After previous term transactions are applied, still in safe mode, - // perform refreshAndValidate to update the safemode rule state. - if (scm.isInSafeMode() && isStateMachineReady.get()) { - scm.getScmSafeModeManager().refreshAndValidate(); - } final TermIndex appliedTermIndex = TermIndex.valueOf(trx.getLogEntry()); transactionBuffer.updateLatestTrxInfo(TransactionInfo.valueOf(appliedTermIndex)); updateLastAppliedTermIndex(appliedTermIndex); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java index 157b1e119c46..09480009455d 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java @@ -50,7 +50,7 @@ public abstract class AbstractContainerSafeModeRule extends SafeModeExitRule c.getNumberOfKeys() > 0) .forEach(c -> containers.put(c.containerID(), c.getReplicationConfig().getMinimumNodes())); totalContainers.set(containers.size()); @@ -134,22 +135,7 @@ public double getCurrentContainerThreshold() { @Override public synchronized void refresh(boolean forceRefresh) { - if (forceRefresh) { - initializeRule(); - } else { - refreshExpectedContainers(); - } - } - - /** - * Refreshes the expected container list when the rule is - * not yet validated and when there are pending transactions to be applied. - */ - private void refreshExpectedContainers() { - if (getSafeModeManager().isScmRatisApplyCaughtUpToCommit()) { - return; - } - if (!validate()) { + if (forceRefresh || !validate()) { initializeRule(); } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java index 0616db61d167..278440bb8df0 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java @@ -37,14 +37,10 @@ import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.scm.container.ContainerManager; import org.apache.hadoop.hdds.scm.ha.SCMContext; -import org.apache.hadoop.hdds.scm.ha.SCMHAManager; -import org.apache.hadoop.hdds.scm.ha.SCMRatisServer; import org.apache.hadoop.hdds.scm.ha.SCMService.Event; import org.apache.hadoop.hdds.scm.ha.SCMServiceManager; import org.apache.hadoop.hdds.scm.node.NodeManager; import org.apache.hadoop.hdds.scm.pipeline.PipelineManager; -import org.apache.hadoop.hdds.scm.server.OzoneStorageContainerManager; -import org.apache.hadoop.hdds.scm.server.StorageContainerManager; import org.apache.hadoop.hdds.server.events.EventQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -165,37 +161,6 @@ public SafeModeMetrics getSafeModeMetrics() { return safeModeMetrics; } - /** - * Returns true when the SCM Ratis state machine has applied all committed log entries - * ({@code lastAppliedIndex >= lastCommittedIndex}). In that case incremental - * {@link AbstractContainerSafeModeRule} refresh may skip work. - *

- * Returns false if SCM is not {@link StorageContainerManager}, HA/Ratis is unavailable, - * or the comparison cannot be made. - */ - public boolean isScmRatisApplyCaughtUpToCommit() { - try { - OzoneStorageContainerManager ozoneScm = scmContext.getScm(); - if (!(ozoneScm instanceof StorageContainerManager)) { - return false; - } - SCMHAManager ha = ((StorageContainerManager) ozoneScm).getScmHAManager(); - if (ha == null) { - return false; - } - SCMRatisServer ratis = ha.getRatisServer(); - if (ratis == null) { - return false; - } - long applied = ratis.getSCMStateMachine().getLastAppliedTermIndex().getIndex(); - long committed = ratis.getDivision().getRaftLog().getLastCommittedIndex(); - return applied >= committed; - } catch (Exception e) { - LOG.debug("Could not compare Ratis last applied vs last committed index", e); - return false; - } - } - private void emitSafeModeStatus() { final SafeModeStatus safeModeStatus = status.get(); safeModeMetrics.setScmInSafeMode(safeModeStatus.isInSafeMode()); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java index e4bed61f1bd2..b1118cc3689c 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java @@ -169,6 +169,67 @@ private void testSafeMode(int numContainers) throws Exception { } + @Test + public void testSafeModeExitWithPeriodicContainerRuleRefresh() throws Exception { + config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "100ms"); + + List ratisContainers = new ArrayList<>(); + ratisContainers.addAll(HddsTestUtils.getContainerInfo(5)); + for (ContainerInfo container : ratisContainers) { + container.setState(HddsProtos.LifeCycleState.CLOSED); + container.setNumberOfKeys(10); + } + + ContainerManager containerManager = mock(ContainerManager.class); + when(containerManager.getContainers(ReplicationType.RATIS)) + .thenAnswer(invocation -> new ArrayList<>(ratisContainers)); + when(containerManager.getContainers(ReplicationType.EC)) + .thenReturn(Collections.emptyList()); + + scmSafeModeManager = new SCMSafeModeManager(config, null, null, containerManager, + serviceManager, queue, scmContext); + scmSafeModeManager.start(); + + assertTrue(scmSafeModeManager.getInSafeMode()); + + RatisContainerSafeModeRule ratisRule = SafeModeRuleFactory.getInstance() + .getSafeModeRule(RatisContainerSafeModeRule.class); + assertEquals(5, ratisRule.getTotalNumberOfContainers(), + "initial Ratis container count from ContainerManager"); + + ratisContainers.addAll(HddsTestUtils.getContainerInfo(5)); + for (int i = 5; i < ratisContainers.size(); i++) { + ratisContainers.get(i).setState(HddsProtos.LifeCycleState.CLOSED); + ratisContainers.get(i).setNumberOfKeys(10); + } + + GenericTestUtils.waitFor( + () -> ratisRule.getTotalNumberOfContainers() == 10, + 100, + 15000); + + SCMDatanodeProtocolServer.NodeRegistrationContainerReport report = + HddsTestUtils.createNodeRegistrationContainerReport(ratisContainers); + queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, report); + queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, report); + + long cutOff = (long) Math.ceil(10 * config.getDouble( + HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT, + HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT)); + + assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics() + .getNumContainerWithOneReplicaReportedThreshold().value()); + + GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(), + 100, 1000 * 30); + GenericTestUtils.waitFor(() -> + scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0, + 100, 1000 * 5); + + assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics() + .getCurrentContainersWithOneReplicaReportedCount().value()); + } + @Test public void testSafeModeExitRule() throws Exception { containers = new ArrayList<>(); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java index c5f30fdb8957..00089c1cc269 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java @@ -19,6 +19,7 @@ import static java.util.concurrent.TimeUnit.MILLISECONDS; import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL; import static org.apache.hadoop.hdds.client.ReplicationFactor.ONE; import static org.apache.hadoop.hdds.client.ReplicationType.RATIS; import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL; @@ -196,6 +197,29 @@ void testIsScmInSafeModeAndForceExit() throws Exception { } + @Test + void testClusterExitsSafeModeWithPeriodicRuleRefresh() throws Exception { + cluster.shutdown(); + conf.set(HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "1s"); + builder = MiniOzoneCluster.newBuilder(conf).setStartDataNodes(true); + cluster = builder.build(); + cluster.waitForClusterToBeReady(); + final StorageContainerManager scm = cluster.getStorageContainerManager(); + TestDataUtil.createKeys(cluster, 100); + GenericTestUtils.waitFor(() -> scm.getContainerManager().getContainers().size() >= 3, + 100, 1000 * 30); + + cluster.restartStorageContainerManager(false); + + assertTrue(cluster.getStorageContainerManager().isInSafeMode(), "SCM should start in safe mode"); + GenericTestUtils.waitFor(() -> scm.getContainerManager().getContainers().size() >= 3, + 100, 1000 * 15); + + cluster.waitTobeOutOfSafeMode(); + + assertFalse(scm.isInSafeMode(), "SCM should exit safe mode with periodic rule refresh enabled"); + } + @Test void testSCMSafeMode() throws Exception { // Test1: Test safe mode when there are no containers in system.