spring-projects · ilayaperumalg · Dec 2, 2025
diff --git a/...-commons/src/main/java/org/springframework/ai/transformer/splitter/TokenTextSplitter.java b/...-commons/src/main/java/org/springframework/ai/transformer/splitter/TokenTextSplitter.java
@@ -108,13 +108,18 @@ protected List<String> doSplit(String text, int chunkSize) {
 				continue;
 			}
 
-			// Find the last period or punctuation mark in the chunk
-			int lastPunctuation = Math.max(chunkText.lastIndexOf('.'), Math.max(chunkText.lastIndexOf('?'),
-					Math.max(chunkText.lastIndexOf('!'), chunkText.lastIndexOf('\n'))));
-
-			if (lastPunctuation != -1 && lastPunctuation > this.minChunkSizeChars) {
-				// Truncate the chunk text at the punctuation mark
-				chunkText = chunkText.substring(0, lastPunctuation + 1);
+			// Only apply punctuation-based truncation if we have more tokens than the
+			// chunk size. This prevents unnecessary splitting of small texts whose
+			// tokens all fit in a single chunk.
+			if (tokens.size() > chunkSize) {
+				// Find the last period or punctuation mark in the chunk
+				int lastPunctuation = Math.max(chunkText.lastIndexOf('.'), Math.max(chunkText.lastIndexOf('?'),
+						Math.max(chunkText.lastIndexOf('!'), chunkText.lastIndexOf('\n'))));
+
+				if (lastPunctuation != -1 && lastPunctuation > this.minChunkSizeChars) {
+					// Truncate the chunk text at the punctuation mark
+					chunkText = chunkText.substring(0, lastPunctuation + 1);
+				}
 			}
 
 			String chunkTextToAppend = (this.keepSeparator) ? chunkText.trim()

diff --git a/...mons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java b/...mons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java
@@ -78,7 +78,7 @@ public void testTokenTextSplitterBuilderWithAllFields() {
 		doc1.setContentFormatter(contentFormatter1);
 
 		var doc2 = new Document("The most oppressive thing about the labyrinth is that you are constantly "
-				+ "being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
+				+ "being forced to choose. It isn't the lack of an exit, but the abundance of exits that is so disorienting.",
 				Map.of("key2", "value22", "key3", "value3"));
 		doc2.setContentFormatter(contentFormatter2);
 
@@ -101,7 +101,7 @@ public void testTokenTextSplitterBuilderWithAllFields() {
 		// Doc 2
 		assertThat(chunks.get(2).getText()).isEqualTo("The most oppressive thing about the labyrinth is that you");
 		assertThat(chunks.get(3).getText()).isEqualTo("are constantly being forced to choose.");
-		assertThat(chunks.get(4).getText()).isEqualTo("It isn’t the lack of an exit, but");
+		assertThat(chunks.get(4).getText()).isEqualTo("It isn't the lack of an exit, but");
 		assertThat(chunks.get(5).getText()).isEqualTo("the abundance of exits that is so disorienting");
 
 		// Verify that the original metadata is copied to all chunks (including
@@ -125,4 +125,44 @@ public void testTokenTextSplitterBuilderWithAllFields() {
 		assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");
 	}
 
+	@Test
+	public void testSmallTextWithPunctuationShouldNotSplit() {
+		// Text with a period beyond minChunkSizeChars, yet far below the chunk size.
+		String shortText = "Hi. This is a small text without one of the ending chars. "
+				+ "It is splitted into multiple chunks but shouldn't";
+		Document input = new Document(shortText);
+
+		TokenTextSplitter smallTextSplitter = TokenTextSplitter.builder()
+			.withKeepSeparator(true)
+			.withChunkSize(10000)
+			.withMinChunkSizeChars(10)
+			.build();
+		List<Document> chunks = smallTextSplitter.split(input);
+		// Expect one chunk whose text is exactly the untouched input.
+		assertThat(chunks.size()).isEqualTo(1);
+		assertThat(chunks.get(0).getText()).isEqualTo(shortText);
+	}
+
+	@Test
+	public void testLargeTextStillSplitsAtPunctuation() {
+		// Three sentences, intended to be longer than the 15-token chunk size, so the
+		// punctuation-based truncation is expected to stay active for this input.
+		String multiSentenceText = "This is the first sentence with enough words. "
+				+ "This is the second sentence. And this is the third sentence.";
+		Document input = new Document(multiSentenceText);
+
+		TokenTextSplitter smallChunkSplitter = TokenTextSplitter.builder()
+			.withKeepSeparator(true)
+			.withChunkSize(15)
+			.withMinChunkSizeChars(10)
+			.build();
+		List<Document> chunks = smallChunkSplitter.split(input);
+
+		// The input must be broken into more than one chunk.
+		assertThat(chunks.size()).isGreaterThan(1);
+		// The leading chunk must end on a sentence terminator.
+		String firstChunkText = chunks.get(0).getText();
+		assertThat(firstChunkText).endsWith(".");
+	}
+
 }