Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.common.cloud.ZkNodeProps;
Expand Down Expand Up @@ -61,7 +60,10 @@ public void testSimpleSliceLeaderElection() throws Exception {
String collection = "collection1";
createCollection(collection);

cluster.waitForActiveCollection(collection, 10, TimeUnit.SECONDS, 2, 6);
waitForState(
"Timeout waiting for collection to become active",
collection,
clusterShape(2, NUM_REPLICAS_OF_SHARD1 + 1));
List<JettySolrRunner> stoppedRunners = new ArrayList<>();
for (int i = 0; i < 4; i++) {
// who is the leader?
Expand Down Expand Up @@ -107,15 +109,20 @@ public void testSimpleSliceLeaderElection() throws Exception {
assertNotNull(jetty);
cluster.expireZkSession(jetty);

for (int i = 0; i < 60; i++) { // wait till leader is changed
if (jetty != getRunner(getLeader(collection))) {
break;
}
Thread.sleep(100);
}

// make sure we have waited long enough for the first leader to have come back
Thread.sleep(ZkTestServer.TICK_TIME * 2 + 100);
// Wait until leadership has moved away from the expired-session node
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the waitForState instead of the thread.sleep

waitForState(
"Expected leader to move away after expiring zk session",
collection,
c -> {
var l = c.getLeader("shard1");
return l != null && !jetty.getNodeName().equals(l.getNodeName());
});

// Wait until the expired-session node is live again before stopping others
waitForState(
"Expected expired-session node to rejoin live nodes",
collection,
(liveNodes, c) -> liveNodes.contains(jetty.getNodeName()));

// kill everyone but the first leader that should have reconnected by now
for (JettySolrRunner jetty2 : cluster.getJettySolrRunners()) {
Expand All @@ -124,18 +131,13 @@ public void testSimpleSliceLeaderElection() throws Exception {
}
}

for (int i = 0; i < 320; i++) { // wait till leader is changed
try {
if (jetty == getRunner(getLeader(collection))) {
break;
}
Thread.sleep(100);
} catch (Exception e) {
continue;
}
}

assertEquals(jetty, getRunner(getLeader(collection)));
waitForState(
"Expected original node to become leader after others stopped",
collection,
c -> {
var l = c.getLeader("shard1");
return l != null && jetty.getNodeName().equals(l.getNodeName());
});
}

private JettySolrRunner getRunner(String nodeName) {
Expand Down
19 changes: 6 additions & 13 deletions solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@
import org.apache.solr.client.solrj.request.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.embedded.JettySolrRunner;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.slf4j.Logger;
Expand Down Expand Up @@ -60,12 +58,16 @@ public void testRestartZkWhenClusterDown() throws Exception {
// This attempt will fail since it will time out after 1 second
System.setProperty("solr.cloud.wait.for.zk.seconds", "1");
restartSolrAndZk();
waitForLiveNodes(0);
waitForState(
"Timeout waiting for 0 live nodes", coll, (liveNodes, c) -> liveNodes.isEmpty());

// This attempt will succeed since there will be enough time to connect
System.setProperty("solr.cloud.wait.for.zk.seconds", "20");
restartSolrAndZk();
waitForLiveNodes(cluster.getJettySolrRunners().size());
waitForState(
"Timeout waiting for all nodes to come up",
coll,
(liveNodes, c) -> liveNodes.size() == cluster.getJettySolrRunners().size());
waitForState("Timeout waiting for " + coll, coll, clusterShape(2, 2));
QueryResponse rsp =
new QueryRequest(new SolrQuery("*:*")).process(cluster.getSolrClient(), coll);
Expand Down Expand Up @@ -100,13 +102,4 @@ private void restartSolrAndZk() throws Exception {
}
}

private void waitForLiveNodes(int numNodes) throws InterruptedException, KeeperException {
ZkStateReader zkStateReader = cluster.getZkStateReader();
for (int i = 0; i < 100; i++) {
zkStateReader.updateLiveNodes();
if (zkStateReader.getClusterState().getLiveNodes().size() == numNodes) return;
Thread.sleep(200);
}
fail("Timeout waiting for number of live nodes = " + numNodes);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,20 @@ public static void setupCluster() throws Exception {
CollectionAdminRequest.createCollection(COLLECTION, "conf", NUM_SHARDS, NUM_REPLICAS)
.process(cluster.getSolrClient())
.getStatus());
cluster.waitForActiveCollection(COLLECTION, 10, TimeUnit.SECONDS, 2, NUM_SHARDS * NUM_REPLICAS);
waitForState(
"Timeout waiting for collection to be active after creation",
COLLECTION,
clusterShape(NUM_SHARDS, NUM_SHARDS * NUM_REPLICAS));
}

@Before
public void waitForActiveState() throws Exception {
CollectionAdminRequest.modifyCollection(COLLECTION, Map.of("readOnly", false))
.process(cluster.getSolrClient());
cluster.waitForActiveCollection(COLLECTION, 10, TimeUnit.SECONDS, 2, NUM_SHARDS * NUM_REPLICAS);
waitForState(
"Timeout waiting for active collection",
COLLECTION,
clusterShape(NUM_SHARDS, NUM_SHARDS * NUM_REPLICAS));
}

@Test
Expand Down
Loading