Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,14 @@ public final class HddsConfigKeys {
"hdds.scm.safemode.log.interval";
public static final String HDDS_SCM_SAFEMODE_LOG_INTERVAL_DEFAULT = "1m";

/**
* Interval for background refresh of safeMode rules. 0 disables the background thread.
*/
public static final String HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL =
"hdds.scm.safemode.rule.refresh.interval";
public static final String
HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT = "5s";

// This configuration setting is used as a fallback location by all
// Ozone/HDDS services for their metadata. It is useful as a single
// config point for test/PoC clusters.
Expand Down
7 changes: 7 additions & 0 deletions hadoop-hdds/common/src/main/resources/ozone-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1701,6 +1701,13 @@
reported replica before SCM comes out of safe mode.
</description>
</property>
<property>
<name>hdds.scm.safemode.rule.refresh.interval</name>
<value>5s</value>
<tag>HDDS,SCM,OPERATION</tag>
<description> Refresh interval in SCM Safemode.
</description>
</property>

<property>
<name>hdds.scm.wait.time.after.safemode.exit</name>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,12 +161,6 @@ public CompletableFuture<Message> applyTransaction(
// Ratis client, leaving SCM intact.
applyTransactionFuture.completeExceptionally(ex);
}

// After previous term transactions are applied, still in safe mode,
// perform refreshAndValidate to update the safemode rule state.
if (scm.isInSafeMode() && isStateMachineReady.get()) {
scm.getScmSafeModeManager().refreshAndValidate();
}
final TermIndex appliedTermIndex = TermIndex.valueOf(trx.getLogEntry());
transactionBuffer.updateLatestTrxInfo(TransactionInfo.valueOf(appliedTermIndex));
updateLastAppliedTermIndex(appliedTermIndex);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED_DEFAULT;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT;

import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.util.HashMap;
Expand Down Expand Up @@ -88,6 +90,7 @@ public class SCMSafeModeManager implements SafeModeManager {
private long safeModeLogIntervalMs;
private ScheduledExecutorService safeModeLogExecutor;
private ScheduledFuture<?> safeModeLogTask;
private final long refreshIntervalMs;

public SCMSafeModeManager(final ConfigurationSource conf,
final NodeManager nodeManager,
Expand Down Expand Up @@ -117,6 +120,31 @@ public SCMSafeModeManager(final ConfigurationSource conf,
status.set(SafeModeStatus.OUT_OF_SAFE_MODE);
emitSafeModeStatus();
}

this.refreshIntervalMs = conf.getTimeDuration(
HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL,
HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT,
TimeUnit.MILLISECONDS);
startRefreshExecutor(refreshIntervalMs);
}

private void startRefreshExecutor(long refreshIntervalMillis) {
final boolean enabled = refreshIntervalMillis > 0;
LOG.info("Container safe mode rule refresh: enabled? {}, {}={}ms",
enabled, HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, refreshIntervalMillis);
if (!enabled) {
return;
}
final ScheduledExecutorService refreshExecutor = Executors.newSingleThreadScheduledExecutor(
new ThreadFactoryBuilder()
.setDaemon(true)
.setNameFormat(getClass().getSimpleName() + "-refresh-%d")
.build());
refreshExecutor.scheduleAtFixedRate(
() -> refreshAndValidate(refreshExecutor),
refreshIntervalMillis,
refreshIntervalMillis,
TimeUnit.MILLISECONDS);
}

public void start() {
Expand Down Expand Up @@ -183,6 +211,13 @@ public synchronized void validateSafeModeExitRules(String ruleName) {
public void forceExitSafeMode() {
LOG.info("SCM force-exiting safe mode.");
status.set(SafeModeStatus.OUT_OF_SAFE_MODE);
exitRules.values().forEach(rule -> {
try {
rule.cleanup();
} catch (Exception e) {
LOG.warn("Safe mode exit rule cleanup failed for {}", rule.getRuleName(), e);
}
});
emitSafeModeStatus();
}

Expand All @@ -204,6 +239,13 @@ public void refresh() {
* Refresh Rule state and validate rules.
*/
public void refreshAndValidate() {
if (refreshIntervalMs > 0) {
return; // use executor to refresh
}
refreshAndValidate(null);
}

private void refreshAndValidate(ScheduledExecutorService refreshExecutor) {
if (getInSafeMode()) {
exitRules.values().forEach(rule -> {
rule.refresh(false);
Expand All @@ -212,6 +254,8 @@ public void refreshAndValidate() {
rule.cleanup();
}
});
} else if (refreshExecutor != null) {
refreshExecutor.shutdownNow(); // Not in safemode
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ public String getRuleName() {
return ruleName;
}

protected final SCMSafeModeManager getSafeModeManager() {
return safeModeManager;
}

/**
* Return's the event type this safeMode exit rule handles.
* @return TypedEvent
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

package org.apache.hadoop.hdds.scm.safemode;

import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
Expand All @@ -27,6 +29,7 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.DatanodeID;
Expand Down Expand Up @@ -57,10 +60,14 @@ public void setup() throws ContainerNotFoundException {
final ContainerManager containerManager = mock(ContainerManager.class);
final ConfigurationSource conf = mock(ConfigurationSource.class);
final EventQueue eventQueue = mock(EventQueue.class);
final SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
final SafeModeMetrics metrics = mock(SafeModeMetrics.class);

when(safeModeManager.getSafeModeMetrics()).thenReturn(metrics);
when(conf.getTimeDuration(
HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL,
HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT,
TimeUnit.MILLISECONDS)).thenReturn(0L);
containers = new ArrayList<>();
when(containerManager.getContainers(getReplicationType())).thenReturn(containers);
when(containerManager.getContainer(any(ContainerID.class))).thenAnswer(invocation -> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ public void setUp() throws IOException {
false);
config.set(HddsConfigKeys.OZONE_METADATA_DIRS, tempDir.getAbsolutePath());
config.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, 1);
config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s");
scmMetadataStore = new SCMMetadataStoreImpl(config);
}

Expand Down Expand Up @@ -168,6 +169,67 @@ private void testSafeMode(int numContainers) throws Exception {

}

@Test
public void testSafeModeExitWithPeriodicContainerRuleRefresh() throws Exception {
config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "100ms");

List<ContainerInfo> ratisContainers = new ArrayList<>();
ratisContainers.addAll(HddsTestUtils.getContainerInfo(5));
for (ContainerInfo container : ratisContainers) {
container.setState(HddsProtos.LifeCycleState.CLOSED);
container.setNumberOfKeys(10);
}

ContainerManager containerManager = mock(ContainerManager.class);
when(containerManager.getContainers(ReplicationType.RATIS))
.thenAnswer(invocation -> new ArrayList<>(ratisContainers));
when(containerManager.getContainers(ReplicationType.EC))
.thenReturn(Collections.emptyList());

scmSafeModeManager = new SCMSafeModeManager(config, null, null, containerManager,
serviceManager, queue, scmContext);
scmSafeModeManager.start();

assertTrue(scmSafeModeManager.getInSafeMode());

RatisContainerSafeModeRule ratisRule = SafeModeRuleFactory.getInstance()
.getSafeModeRule(RatisContainerSafeModeRule.class);
assertEquals(5, ratisRule.getTotalNumberOfContainers(),
"initial Ratis container count from ContainerManager");

ratisContainers.addAll(HddsTestUtils.getContainerInfo(5));
for (int i = 5; i < ratisContainers.size(); i++) {
ratisContainers.get(i).setState(HddsProtos.LifeCycleState.CLOSED);
ratisContainers.get(i).setNumberOfKeys(10);
}

GenericTestUtils.waitFor(
() -> ratisRule.getTotalNumberOfContainers() == 10,
100,
15000);

SCMDatanodeProtocolServer.NodeRegistrationContainerReport report =
HddsTestUtils.createNodeRegistrationContainerReport(ratisContainers);
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, report);
queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, report);

long cutOff = (long) Math.ceil(10 * config.getDouble(
HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT));

assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics()
.getNumContainerWithOneReplicaReportedThreshold().value());

GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
100, 1000 * 30);
GenericTestUtils.waitFor(() ->
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
100, 1000 * 5);

assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics()
.getCurrentContainersWithOneReplicaReportedCount().value());
}

@Test
public void testSafeModeExitRule() throws Exception {
containers = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import static org.mockito.Mockito.when;

import java.lang.reflect.Field;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.scm.container.ContainerManager;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
Expand Down Expand Up @@ -79,7 +80,9 @@ public void testLoadedPreCheckRules() {
private SCMSafeModeManager initializeSafeModeRuleFactory() {
final SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
when(safeModeManager.getSafeModeMetrics()).thenReturn(mock(SafeModeMetrics.class));
SafeModeRuleFactory.initialize(new OzoneConfiguration(),
OzoneConfiguration conf = new OzoneConfiguration();
conf.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s");
SafeModeRuleFactory.initialize(conf,
SCMContext.emptyContext(), new EventQueue(), mock(
PipelineManager.class),
mock(ContainerManager.class), mock(NodeManager.class));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
import static org.apache.hadoop.hdds.client.ReplicationFactor.ONE;
import static org.apache.hadoop.hdds.client.ReplicationType.RATIS;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL;
Expand Down Expand Up @@ -196,6 +197,29 @@ void testIsScmInSafeModeAndForceExit() throws Exception {

}

@Test
void testClusterExitsSafeModeWithPeriodicRuleRefresh() throws Exception {
cluster.shutdown();
conf.set(HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "1s");
builder = MiniOzoneCluster.newBuilder(conf).setStartDataNodes(true);
cluster = builder.build();
cluster.waitForClusterToBeReady();
final StorageContainerManager scm = cluster.getStorageContainerManager();
TestDataUtil.createKeys(cluster, 100);
GenericTestUtils.waitFor(() -> scm.getContainerManager().getContainers().size() >= 3,
100, 1000 * 30);

cluster.restartStorageContainerManager(false);

assertTrue(cluster.getStorageContainerManager().isInSafeMode(), "SCM should start in safe mode");
GenericTestUtils.waitFor(() -> scm.getContainerManager().getContainers().size() >= 3,
100, 1000 * 15);

cluster.waitTobeOutOfSafeMode();

assertFalse(scm.isInSafeMode(), "SCM should exit safe mode with periodic rule refresh enabled");
}

@Test
void testSCMSafeMode() throws Exception {
// Test1: Test safe mode when there are no containers in system.
Expand Down