diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
index 0473d6da36ab..d71af31768c1 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
@@ -116,6 +116,14 @@ public final class HddsConfigKeys {
"hdds.scm.safemode.log.interval";
public static final String HDDS_SCM_SAFEMODE_LOG_INTERVAL_DEFAULT = "1m";
+ /**
+ * Interval for background refresh of safeMode rules. 0 disables the background thread.
+ */
+ public static final String HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL =
+ "hdds.scm.safemode.rule.refresh.interval";
+ public static final String
+ HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT = "5s";
+
// This configuration setting is used as a fallback location by all
// Ozone/HDDS services for their metadata. It is useful as a single
// config point for test/PoC clusters.
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index d63dedb15416..4afc795a6d64 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -1701,6 +1701,13 @@
reported replica before SCM comes out of safe mode.
+
+ hdds.scm.safemode.rule.refresh.interval
+ 5s
+ HDDS,SCM,OPERATION
+ Refresh interval in SCM Safemode.
+
+
hdds.scm.wait.time.after.safemode.exit
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
index d702bb2a5d46..627e9a296c62 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
@@ -161,12 +161,6 @@ public CompletableFuture applyTransaction(
// Ratis client, leaving SCM intact.
applyTransactionFuture.completeExceptionally(ex);
}
-
- // After previous term transactions are applied, still in safe mode,
- // perform refreshAndValidate to update the safemode rule state.
- if (scm.isInSafeMode() && isStateMachineReady.get()) {
- scm.getScmSafeModeManager().refreshAndValidate();
- }
final TermIndex appliedTermIndex = TermIndex.valueOf(trx.getLogEntry());
transactionBuffer.updateLatestTrxInfo(TransactionInfo.valueOf(appliedTermIndex));
updateLastAppliedTermIndex(appliedTermIndex);
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index 2c9173b2bf09..278440bb8df0 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -19,6 +19,8 @@
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED_DEFAULT;
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.util.HashMap;
@@ -88,6 +90,7 @@ public class SCMSafeModeManager implements SafeModeManager {
private long safeModeLogIntervalMs;
private ScheduledExecutorService safeModeLogExecutor;
private ScheduledFuture> safeModeLogTask;
+ private final long refreshIntervalMs;
public SCMSafeModeManager(final ConfigurationSource conf,
final NodeManager nodeManager,
@@ -117,6 +120,31 @@ public SCMSafeModeManager(final ConfigurationSource conf,
status.set(SafeModeStatus.OUT_OF_SAFE_MODE);
emitSafeModeStatus();
}
+
+ this.refreshIntervalMs = conf.getTimeDuration(
+ HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL,
+ HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT,
+ TimeUnit.MILLISECONDS);
+ startRefreshExecutor(refreshIntervalMs);
+ }
+
+ private void startRefreshExecutor(long refreshIntervalMillis) {
+ final boolean enabled = refreshIntervalMillis > 0;
+ LOG.info("Container safe mode rule refresh: enabled? {}, {}={}ms",
+ enabled, HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, refreshIntervalMillis);
+ if (!enabled) {
+ return;
+ }
+ final ScheduledExecutorService refreshExecutor = Executors.newSingleThreadScheduledExecutor(
+ new ThreadFactoryBuilder()
+ .setDaemon(true)
+ .setNameFormat(getClass().getSimpleName() + "-refresh-%d")
+ .build());
+ refreshExecutor.scheduleAtFixedRate(
+ () -> refreshAndValidate(refreshExecutor),
+ refreshIntervalMillis,
+ refreshIntervalMillis,
+ TimeUnit.MILLISECONDS);
}
public void start() {
@@ -183,6 +211,13 @@ public synchronized void validateSafeModeExitRules(String ruleName) {
public void forceExitSafeMode() {
LOG.info("SCM force-exiting safe mode.");
status.set(SafeModeStatus.OUT_OF_SAFE_MODE);
+ exitRules.values().forEach(rule -> {
+ try {
+ rule.cleanup();
+ } catch (Exception e) {
+ LOG.warn("Safe mode exit rule cleanup failed for {}", rule.getRuleName(), e);
+ }
+ });
emitSafeModeStatus();
}
@@ -204,6 +239,13 @@ public void refresh() {
* Refresh Rule state and validate rules.
*/
public void refreshAndValidate() {
+ if (refreshIntervalMs > 0) {
+ return; // use executor to refresh
+ }
+ refreshAndValidate(null);
+ }
+
+ private void refreshAndValidate(ScheduledExecutorService refreshExecutor) {
if (getInSafeMode()) {
exitRules.values().forEach(rule -> {
rule.refresh(false);
@@ -212,6 +254,8 @@ public void refreshAndValidate() {
rule.cleanup();
}
});
+ } else if (refreshExecutor != null) {
+ refreshExecutor.shutdownNow(); // Not in safemode
}
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java
index 8535fbdf15a9..e28ca90ffe1a 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java
@@ -68,6 +68,10 @@ public String getRuleName() {
return ruleName;
}
+ protected final SCMSafeModeManager getSafeModeManager() {
+ return safeModeManager;
+ }
+
/**
* Return's the event type this safeMode exit rule handles.
* @return TypedEvent
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
index 7bfdecc71964..7e25c0d187e3 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
@@ -17,6 +17,8 @@
package org.apache.hadoop.hdds.scm.safemode;
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -27,6 +29,7 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.DatanodeID;
@@ -57,10 +60,14 @@ public void setup() throws ContainerNotFoundException {
final ContainerManager containerManager = mock(ContainerManager.class);
final ConfigurationSource conf = mock(ConfigurationSource.class);
final EventQueue eventQueue = mock(EventQueue.class);
- final SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
+ SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
final SafeModeMetrics metrics = mock(SafeModeMetrics.class);
when(safeModeManager.getSafeModeMetrics()).thenReturn(metrics);
+ when(conf.getTimeDuration(
+ HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL,
+ HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT,
+ TimeUnit.MILLISECONDS)).thenReturn(0L);
containers = new ArrayList<>();
when(containerManager.getContainers(getReplicationType())).thenReturn(containers);
when(containerManager.getContainer(any(ContainerID.class))).thenAnswer(invocation -> {
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
index fdf38a7a67ca..b1118cc3689c 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
@@ -108,6 +108,7 @@ public void setUp() throws IOException {
false);
config.set(HddsConfigKeys.OZONE_METADATA_DIRS, tempDir.getAbsolutePath());
config.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, 1);
+ config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s");
scmMetadataStore = new SCMMetadataStoreImpl(config);
}
@@ -168,6 +169,67 @@ private void testSafeMode(int numContainers) throws Exception {
}
+ @Test
+ public void testSafeModeExitWithPeriodicContainerRuleRefresh() throws Exception {
+ config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "100ms");
+
+ List ratisContainers = new ArrayList<>();
+ ratisContainers.addAll(HddsTestUtils.getContainerInfo(5));
+ for (ContainerInfo container : ratisContainers) {
+ container.setState(HddsProtos.LifeCycleState.CLOSED);
+ container.setNumberOfKeys(10);
+ }
+
+ ContainerManager containerManager = mock(ContainerManager.class);
+ when(containerManager.getContainers(ReplicationType.RATIS))
+ .thenAnswer(invocation -> new ArrayList<>(ratisContainers));
+ when(containerManager.getContainers(ReplicationType.EC))
+ .thenReturn(Collections.emptyList());
+
+ scmSafeModeManager = new SCMSafeModeManager(config, null, null, containerManager,
+ serviceManager, queue, scmContext);
+ scmSafeModeManager.start();
+
+ assertTrue(scmSafeModeManager.getInSafeMode());
+
+ RatisContainerSafeModeRule ratisRule = SafeModeRuleFactory.getInstance()
+ .getSafeModeRule(RatisContainerSafeModeRule.class);
+ assertEquals(5, ratisRule.getTotalNumberOfContainers(),
+ "initial Ratis container count from ContainerManager");
+
+ ratisContainers.addAll(HddsTestUtils.getContainerInfo(5));
+ for (int i = 5; i < ratisContainers.size(); i++) {
+ ratisContainers.get(i).setState(HddsProtos.LifeCycleState.CLOSED);
+ ratisContainers.get(i).setNumberOfKeys(10);
+ }
+
+ GenericTestUtils.waitFor(
+ () -> ratisRule.getTotalNumberOfContainers() == 10,
+ 100,
+ 15000);
+
+ SCMDatanodeProtocolServer.NodeRegistrationContainerReport report =
+ HddsTestUtils.createNodeRegistrationContainerReport(ratisContainers);
+ queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, report);
+ queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, report);
+
+ long cutOff = (long) Math.ceil(10 * config.getDouble(
+ HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
+ HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT));
+
+ assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics()
+ .getNumContainerWithOneReplicaReportedThreshold().value());
+
+ GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
+ 100, 1000 * 30);
+ GenericTestUtils.waitFor(() ->
+ scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
+ 100, 1000 * 5);
+
+ assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics()
+ .getCurrentContainersWithOneReplicaReportedCount().value());
+ }
+
@Test
public void testSafeModeExitRule() throws Exception {
containers = new ArrayList<>();
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
index f795a6c57628..cdafe912c9f0 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
@@ -23,6 +23,7 @@
import static org.mockito.Mockito.when;
import java.lang.reflect.Field;
+import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.scm.container.ContainerManager;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
@@ -79,7 +80,9 @@ public void testLoadedPreCheckRules() {
private SCMSafeModeManager initializeSafeModeRuleFactory() {
final SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
when(safeModeManager.getSafeModeMetrics()).thenReturn(mock(SafeModeMetrics.class));
- SafeModeRuleFactory.initialize(new OzoneConfiguration(),
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s");
+ SafeModeRuleFactory.initialize(conf,
SCMContext.emptyContext(), new EventQueue(), mock(
PipelineManager.class),
mock(ContainerManager.class), mock(NodeManager.class));
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
index c5f30fdb8957..00089c1cc269 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
@@ -19,6 +19,7 @@
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL;
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
import static org.apache.hadoop.hdds.client.ReplicationFactor.ONE;
import static org.apache.hadoop.hdds.client.ReplicationType.RATIS;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL;
@@ -196,6 +197,29 @@ void testIsScmInSafeModeAndForceExit() throws Exception {
}
+ @Test
+ void testClusterExitsSafeModeWithPeriodicRuleRefresh() throws Exception {
+ cluster.shutdown();
+ conf.set(HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "1s");
+ builder = MiniOzoneCluster.newBuilder(conf).setStartDataNodes(true);
+ cluster = builder.build();
+ cluster.waitForClusterToBeReady();
+ final StorageContainerManager scm = cluster.getStorageContainerManager();
+ TestDataUtil.createKeys(cluster, 100);
+ GenericTestUtils.waitFor(() -> scm.getContainerManager().getContainers().size() >= 3,
+ 100, 1000 * 30);
+
+ cluster.restartStorageContainerManager(false);
+
+ assertTrue(cluster.getStorageContainerManager().isInSafeMode(), "SCM should start in safe mode");
+ GenericTestUtils.waitFor(() -> scm.getContainerManager().getContainers().size() >= 3,
+ 100, 1000 * 15);
+
+ cluster.waitTobeOutOfSafeMode();
+
+ assertFalse(scm.isInSafeMode(), "SCM should exit safe mode with periodic rule refresh enabled");
+ }
+
@Test
void testSCMSafeMode() throws Exception {
// Test1: Test safe mode when there are no containers in system.