Skip to content

Commit 4407326

Browse files
committed
Fix MongoDB job restart failure after abrupt shutdown
Fixes #4943 where job restart fails after abrupt shutdown because getLastStepExecution() only checks the empty stepExecutions array in BATCH_JOB_EXECUTION instead of the BATCH_STEP_EXECUTION collection. Changes: - getLastStepExecution(): Query BATCH_STEP_EXECUTION collection directly - countStepExecutions(): Query collection instead of embedded array - Align with JDBC implementation behavior Signed-off-by: baezzys <wlsdn3578@gmail.com>
1 parent 3da06e5 commit 4407326

File tree

2 files changed

+155
-50
lines changed

2 files changed

+155
-50
lines changed

spring-batch-core/src/main/java/org/springframework/batch/core/repository/dao/mongodb/MongoStepExecutionDao.java

Lines changed: 58 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,16 @@
1515
*/
1616
package org.springframework.batch.core.repository.dao.mongodb;
1717

18-
import java.util.ArrayList;
1918
import java.util.Collection;
20-
import java.util.Comparator;
2119
import java.util.List;
22-
import java.util.Optional;
2320

2421
import org.springframework.batch.core.job.JobExecution;
2522
import org.springframework.batch.core.job.JobInstance;
2623
import org.springframework.batch.core.step.StepExecution;
2724
import org.springframework.batch.core.repository.dao.StepExecutionDao;
2825
import org.springframework.batch.core.repository.persistence.converter.JobExecutionConverter;
2926
import org.springframework.batch.core.repository.persistence.converter.StepExecutionConverter;
27+
import org.springframework.data.domain.Sort;
3028
import org.springframework.data.mongodb.core.MongoOperations;
3129
import org.springframework.data.mongodb.core.query.Query;
3230
import org.springframework.jdbc.support.incrementer.DataFieldMaxValueIncrementer;
@@ -36,6 +34,7 @@
3634

3735
/**
3836
* @author Mahmoud Ben Hassine
37+
* @author Jinwoo Bae
3938
* @since 5.2.0
4039
*/
4140
public class MongoStepExecutionDao implements StepExecutionDao {
@@ -66,7 +65,7 @@ public void setStepExecutionIncrementer(DataFieldMaxValueIncrementer stepExecuti
6665
@Override
6766
public void saveStepExecution(StepExecution stepExecution) {
6867
org.springframework.batch.core.repository.persistence.StepExecution stepExecutionToSave = this.stepExecutionConverter
69-
.fromStepExecution(stepExecution);
68+
.fromStepExecution(stepExecution);
7069
long stepExecutionId = this.stepExecutionIncrementer.nextLongValue();
7170
stepExecutionToSave.setStepExecutionId(stepExecutionId);
7271
this.mongoOperations.insert(stepExecutionToSave, STEP_EXECUTIONS_COLLECTION_NAME);
@@ -84,82 +83,91 @@ public void saveStepExecutions(Collection<StepExecution> stepExecutions) {
8483
public void updateStepExecution(StepExecution stepExecution) {
8584
Query query = query(where("stepExecutionId").is(stepExecution.getId()));
8685
org.springframework.batch.core.repository.persistence.StepExecution stepExecutionToUpdate = this.stepExecutionConverter
87-
.fromStepExecution(stepExecution);
86+
.fromStepExecution(stepExecution);
8887
this.mongoOperations.findAndReplace(query, stepExecutionToUpdate, STEP_EXECUTIONS_COLLECTION_NAME);
8988
}
9089

9190
@Override
9291
public StepExecution getStepExecution(JobExecution jobExecution, Long stepExecutionId) {
9392
Query query = query(where("stepExecutionId").is(stepExecutionId));
9493
org.springframework.batch.core.repository.persistence.StepExecution stepExecution = this.mongoOperations
95-
.findOne(query, org.springframework.batch.core.repository.persistence.StepExecution.class,
96-
STEP_EXECUTIONS_COLLECTION_NAME);
94+
.findOne(query, org.springframework.batch.core.repository.persistence.StepExecution.class,
95+
STEP_EXECUTIONS_COLLECTION_NAME);
9796
return stepExecution != null ? this.stepExecutionConverter.toStepExecution(stepExecution, jobExecution) : null;
9897
}
9998

10099
@Override
101100
public StepExecution getLastStepExecution(JobInstance jobInstance, String stepName) {
102101
// TODO optimize the query
103-
// get all step executions
104-
List<org.springframework.batch.core.repository.persistence.StepExecution> stepExecutions = new ArrayList<>();
105-
Query query = query(where("jobInstanceId").is(jobInstance.getId()));
102+
Query jobExecutionQuery = query(where("jobInstanceId").is(jobInstance.getId()));
106103
List<org.springframework.batch.core.repository.persistence.JobExecution> jobExecutions = this.mongoOperations
107-
.find(query, org.springframework.batch.core.repository.persistence.JobExecution.class,
108-
JOB_EXECUTIONS_COLLECTION_NAME);
109-
for (org.springframework.batch.core.repository.persistence.JobExecution jobExecution : jobExecutions) {
110-
stepExecutions.addAll(jobExecution.getStepExecutions());
104+
.find(jobExecutionQuery, org.springframework.batch.core.repository.persistence.JobExecution.class,
105+
JOB_EXECUTIONS_COLLECTION_NAME);
106+
107+
if (jobExecutions.isEmpty()) {
108+
return null;
109+
}
110+
111+
List<Long> jobExecutionIds = jobExecutions.stream()
112+
.map(org.springframework.batch.core.repository.persistence.JobExecution::getJobExecutionId)
113+
.toList();
114+
115+
Query stepExecutionQuery = query(where("name").is(stepName).and("jobExecutionId").in(jobExecutionIds))
116+
.with(Sort.by(Sort.Direction.DESC, "createTime", "stepExecutionId"))
117+
.limit(1);
118+
119+
org.springframework.batch.core.repository.persistence.StepExecution stepExecution = this.mongoOperations
120+
.findOne(stepExecutionQuery, org.springframework.batch.core.repository.persistence.StepExecution.class,
121+
STEP_EXECUTIONS_COLLECTION_NAME);
122+
123+
if (stepExecution == null) {
124+
return null;
111125
}
112-
// sort step executions by creation date then id (see contract) and return the
113-
// first one
114-
Optional<org.springframework.batch.core.repository.persistence.StepExecution> lastStepExecution = stepExecutions
115-
.stream()
116-
.filter(stepExecution -> stepExecution.getName().equals(stepName))
117-
.min(Comparator
118-
.comparing(org.springframework.batch.core.repository.persistence.StepExecution::getCreateTime)
119-
.thenComparing(org.springframework.batch.core.repository.persistence.StepExecution::getId));
120-
if (lastStepExecution.isPresent()) {
121-
org.springframework.batch.core.repository.persistence.StepExecution stepExecution = lastStepExecution.get();
122-
JobExecution jobExecution = this.jobExecutionConverter.toJobExecution(jobExecutions.stream()
126+
127+
org.springframework.batch.core.repository.persistence.JobExecution jobExecution = jobExecutions.stream()
123128
.filter(execution -> execution.getJobExecutionId().equals(stepExecution.getJobExecutionId()))
124129
.findFirst()
125-
.get(), jobInstance);
126-
return this.stepExecutionConverter.toStepExecution(stepExecution, jobExecution);
127-
}
128-
else {
129-
return null;
130+
.orElse(null);
131+
132+
if (jobExecution != null) {
133+
JobExecution jobExecutionDomain = this.jobExecutionConverter.toJobExecution(jobExecution, jobInstance);
134+
return this.stepExecutionConverter.toStepExecution(stepExecution, jobExecutionDomain);
130135
}
136+
137+
return null;
131138
}
132139

133140
@Override
134141
public void addStepExecutions(JobExecution jobExecution) {
135142
Query query = query(where("jobExecutionId").is(jobExecution.getId()));
136143
List<StepExecution> stepExecutions = this.mongoOperations
137-
.find(query, org.springframework.batch.core.repository.persistence.StepExecution.class,
138-
STEP_EXECUTIONS_COLLECTION_NAME)
139-
.stream()
140-
.map(stepExecution -> this.stepExecutionConverter.toStepExecution(stepExecution, jobExecution))
141-
.toList();
144+
.find(query, org.springframework.batch.core.repository.persistence.StepExecution.class,
145+
STEP_EXECUTIONS_COLLECTION_NAME)
146+
.stream()
147+
.map(stepExecution -> this.stepExecutionConverter.toStepExecution(stepExecution, jobExecution))
148+
.toList();
142149
jobExecution.addStepExecutions(stepExecutions);
143150
}
144151

145152
@Override
146153
public long countStepExecutions(JobInstance jobInstance, String stepName) {
147-
long count = 0;
148-
// TODO optimize the count query
149-
Query query = query(where("jobInstanceId").is(jobInstance.getId()));
150-
List<org.springframework.batch.core.repository.persistence.JobExecution> jobExecutions = this.mongoOperations
151-
.find(query, org.springframework.batch.core.repository.persistence.JobExecution.class,
152-
JOB_EXECUTIONS_COLLECTION_NAME);
153-
for (org.springframework.batch.core.repository.persistence.JobExecution jobExecution : jobExecutions) {
154-
List<org.springframework.batch.core.repository.persistence.StepExecution> stepExecutions = jobExecution
155-
.getStepExecutions();
156-
for (org.springframework.batch.core.repository.persistence.StepExecution stepExecution : stepExecutions) {
157-
if (stepExecution.getName().equals(stepName)) {
158-
count++;
159-
}
160-
}
154+
Query jobExecutionQuery = query(where("jobInstanceId").is(jobInstance.getId()));
155+
List<Long> jobExecutionIds = this.mongoOperations
156+
.find(jobExecutionQuery, org.springframework.batch.core.repository.persistence.JobExecution.class,
157+
JOB_EXECUTIONS_COLLECTION_NAME)
158+
.stream()
159+
.map(org.springframework.batch.core.repository.persistence.JobExecution::getJobExecutionId)
160+
.toList();
161+
162+
if (jobExecutionIds.isEmpty()) {
163+
return 0;
161164
}
162-
return count;
165+
166+
// Count step executions directly from BATCH_STEP_EXECUTION collection
167+
Query stepQuery = query(where("name").is(stepName).and("jobExecutionId").in(jobExecutionIds));
168+
return this.mongoOperations.count(stepQuery,
169+
org.springframework.batch.core.repository.persistence.StepExecution.class,
170+
STEP_EXECUTIONS_COLLECTION_NAME);
163171
}
164172

165173
}

spring-batch-core/src/test/java/org/springframework/batch/core/repository/support/MongoDBJobRepositoryIntegrationTests.java

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
package org.springframework.batch.core.repository.support;
1717

1818
import java.time.LocalDateTime;
19+
import java.util.Collections;
1920
import java.util.Map;
2021

2122
import com.mongodb.client.MongoCollection;
@@ -29,14 +30,22 @@
2930
import org.springframework.test.context.junit.jupiter.SpringJUnitConfig;
3031
import org.testcontainers.junit.jupiter.Testcontainers;
3132

33+
import org.springframework.batch.core.BatchStatus;
3234
import org.springframework.batch.core.ExitStatus;
3335
import org.springframework.batch.core.job.Job;
3436
import org.springframework.batch.core.job.JobExecution;
37+
import org.springframework.batch.core.job.JobInstance;
3538
import org.springframework.batch.core.job.parameters.JobParameters;
3639
import org.springframework.batch.core.job.parameters.JobParametersBuilder;
3740
import org.springframework.batch.core.launch.JobOperator;
41+
import org.springframework.batch.core.repository.JobRepository;
42+
import org.springframework.batch.core.repository.dao.StepExecutionDao;
43+
import org.springframework.batch.core.step.StepExecution;
3844
import org.springframework.beans.factory.annotation.Autowired;
3945
import org.springframework.data.mongodb.core.MongoTemplate;
46+
import org.springframework.data.mongodb.core.query.Criteria;
47+
import org.springframework.data.mongodb.core.query.Query;
48+
import org.springframework.data.mongodb.core.query.Update;
4049

4150
/**
4251
* @author Mahmoud Ben Hassine
@@ -112,6 +121,94 @@ void testJobExecution(@Autowired JobOperator jobOperator, @Autowired Job job) th
112121
dump(stepExecutionsCollection, "step execution = ");
113122
}
114123

124+
/**
125+
* Test for GitHub issue #4943: MongoDB abrupt shutdown restart problem.
126+
*
127+
* <p>
128+
* This test simulates the scenario where a job is abruptly terminated, the status is
129+
* manually changed to FAILED, but the JobExecution's embedded stepExecutions array
130+
* remains empty. The fixed implementation should still be able to find and restart
131+
* the job by querying the BATCH_STEP_EXECUTION collection directly.
132+
*
133+
* @see <a href="https://github.com/spring-projects/spring-batch/issues/4943">GitHub
134+
* Issue #4943</a>
135+
*/
136+
@Test
137+
void testAbruptShutdownRestart(@Autowired JobOperator jobOperator, @Autowired Job job,
138+
@Autowired JobRepository jobRepository, @Autowired StepExecutionDao stepExecutionDao) throws Exception {
139+
// Step 1: Start a job and let it complete normally first
140+
JobParameters jobParameters = new JobParametersBuilder().addString("name", "abruptShutdownTest")
141+
.addLocalDateTime("runtime", LocalDateTime.now())
142+
.toJobParameters();
143+
144+
JobExecution originalJobExecution = jobOperator.start(job, jobParameters);
145+
JobInstance jobInstance = originalJobExecution.getJobInstance();
146+
147+
// Wait for job to complete (JobOperator.start is synchronous)
148+
149+
// Step 2: *** SIMULATE ABRUPT SHUTDOWN ***
150+
// 1. Job status is manually changed to FAILED
151+
// 2. StepExecutions remain in BATCH_STEP_EXECUTION collection
152+
// 3. BUT embedded stepExecutions array in BATCH_JOB_EXECUTION becomes empty
153+
Query jobQuery = new Query(Criteria.where("jobExecutionId").is(originalJobExecution.getId()));
154+
Update jobUpdate = new Update().set("status", BatchStatus.FAILED)
155+
.set("exitStatus.exitCode", "FAILED")
156+
.set("exitStatus.exitDescription", "Simulated abrupt shutdown")
157+
.set("endTime", LocalDateTime.now())
158+
.set("stepExecutions", Collections.emptyList()); // ← This is the key issue!
159+
160+
mongoTemplate.updateFirst(jobQuery, jobUpdate, "BATCH_JOB_EXECUTION");
161+
162+
// Update step executions status to FAILED (but keep them in BATCH_STEP_EXECUTION
163+
// collection)
164+
Query stepQuery = new Query(Criteria.where("jobExecutionId").is(originalJobExecution.getId()));
165+
Update stepUpdate = new Update().set("status", BatchStatus.FAILED)
166+
.set("exitStatus.exitCode", "FAILED")
167+
.set("endTime", LocalDateTime.now());
168+
169+
mongoTemplate.updateMulti(stepQuery, stepUpdate, "BATCH_STEP_EXECUTION");
170+
171+
// Step 3: Test the fixed getLastStepExecution method
172+
// This is where the bug would occur with the old implementation
173+
StepExecution lastStepExecution = stepExecutionDao.getLastStepExecution(jobInstance, "step1");
174+
Assertions.assertNotNull(lastStepExecution,
175+
"getLastStepExecution should find step execution even after abrupt shutdown");
176+
Assertions.assertEquals("step1", lastStepExecution.getStepName());
177+
Assertions.assertEquals(BatchStatus.FAILED, lastStepExecution.getStatus());
178+
179+
// Step 4: Test the fixed countStepExecutions method
180+
long stepCount = stepExecutionDao.countStepExecutions(jobInstance, "step1");
181+
Assertions.assertEquals(1L, stepCount,
182+
"countStepExecutions should count from BATCH_STEP_EXECUTION collection directly");
183+
184+
// Step 5: Verify the abrupt shutdown simulation worked
185+
MongoCollection<Document> jobExecutionsCollection = mongoTemplate.getCollection("BATCH_JOB_EXECUTION");
186+
MongoCollection<Document> stepExecutionsCollection = mongoTemplate.getCollection("BATCH_STEP_EXECUTION");
187+
188+
// Check that job execution has empty stepExecutions (simulating abrupt shutdown)
189+
Document jobDoc = jobExecutionsCollection.find(new Document("jobExecutionId", originalJobExecution.getId()))
190+
.first();
191+
Assertions.assertNotNull(jobDoc);
192+
Assertions.assertTrue(jobDoc.getList("stepExecutions", Document.class).isEmpty(),
193+
"JobExecution should have empty stepExecutions array (simulating abrupt shutdown)");
194+
195+
// But step executions should still exist in separate collection
196+
Assertions.assertEquals(2, stepExecutionsCollection.countDocuments(),
197+
"BATCH_STEP_EXECUTION collection should still contain step execution data");
198+
199+
// Step 6: Test restart capability (this would fail with the old bug)
200+
JobExecution reloadedJobExecution = jobRepository.getJobExecution(originalJobExecution.getId());
201+
Assertions.assertNotNull(reloadedJobExecution, "JobExecution should be found in repository");
202+
203+
// The restart should work now with the fix (synchronous operation)
204+
JobExecution restartedJobExecution = jobOperator.restart(reloadedJobExecution);
205+
206+
// Verify restart was successful
207+
Assertions.assertNotNull(restartedJobExecution);
208+
Assertions.assertEquals(ExitStatus.COMPLETED, restartedJobExecution.getExitStatus());
209+
Assertions.assertEquals(BatchStatus.COMPLETED, restartedJobExecution.getStatus());
210+
}
211+
115212
private static void dump(MongoCollection<Document> collection, String prefix) {
116213
for (Document document : collection.find()) {
117214
System.out.println(prefix + document.toJson());

0 commit comments

Comments
 (0)