linkedin · satishkotha · Dec 5, 2025 · Nov 19, 2025 · Nov 25, 2025 · Nov 26, 2025
diff --git a/ambry-router/src/main/java/com/github/ambry/router/PutManager.java b/ambry-router/src/main/java/com/github/ambry/router/PutManager.java
@@ -15,7 +15,6 @@
 
 import com.github.ambry.account.Account;
 import com.github.ambry.account.AccountService;
-import com.github.ambry.account.Container;
 import com.github.ambry.clustermap.ClusterMap;
 import com.github.ambry.clustermap.ClusterMapUtils;
 import com.github.ambry.commons.ByteBufferAsyncWritableChannel;

diff --git a/ambry-router/src/main/java/com/github/ambry/router/PutOperation.java b/ambry-router/src/main/java/com/github/ambry/router/PutOperation.java
@@ -57,6 +57,7 @@
 import io.netty.buffer.Unpooled;
 import io.netty.util.ReferenceCountUtil;
 import java.nio.ByteBuffer;
+import java.nio.channels.ClosedChannelException;
 import java.security.GeneralSecurityException;
 import java.util.ArrayList;
 import java.util.Collections;
@@ -574,10 +575,23 @@ void setOperationCompleted() {
 
   /**
    * Clean up the chunks to release any data buffer. This should be invoked when terminating the operation with
-   * an exception.
+   * an exception. This method also closes the chunkFillerChannel to fire any pending callbacks, ensuring the original
+   * buffer from the ReadableStreamChannel is properly released. Synchronized for memory visibility on channelReadBuf.
+   * The contract upheld by PutManager is that this method is called AT-MOST-ONCE.
    */
-  public void cleanupChunks() {
-    releaseDataForAllChunks();
+  public synchronized void cleanupChunks() {
+    try {
+      releaseDataForAllChunks();
+    } finally {
+      try {
+        // Release the extra reference retained in fillChunks(); a null channelReadBuf is a no-op here.
+        ReferenceCountUtil.release(channelReadBuf);
+      } finally {
+        channelReadBuf = null;
+        // Close even if the release above throws, so callbacks pending in chunksAwaitingResolution still fire.
+        chunkFillerChannel.close();
+      }
+    }
   }
 
   /**
@@ -676,13 +690,18 @@ boolean isChunkFillingDone() {
    * chunkFillerChannel, if there is any.
    * @throws InterruptedException if the call to get a chunk from the chunkFillerChannel is interrupted.
    */
-  void fillChunks() {
+  synchronized void fillChunks() {
     try {
       PutChunk chunkToFill;
       while (!isChunkFillingDone()) {
         // Attempt to fill a chunk
         if (channelReadBuf == null) {
           channelReadBuf = chunkFillerChannel.getNextByteBuf(0);
+          if (channelReadBuf != null) {
+            // Retain the buffer so it stays valid even if the channel callback releases
+            // its own reference while this operation is still reading from it.
+            channelReadBuf.retain();
+          }
         }
         if (channelReadBuf != null) {
           if (channelReadBuf.readableBytes() > 0 && isChunkAwaitingResolution()) {
@@ -707,8 +726,13 @@ void fillChunks() {
               routerCallback.onPollReady();
             }
             if (!channelReadBuf.isReadable()) {
-              chunkFillerChannel.resolveOldestChunk(null);
-              channelReadBuf = null;
+              try {
+                chunkFillerChannel.resolveOldestChunk(null);
+              } finally {
+                // Release the reference we retained when storing getNextByteBuf, even if resolveOldestChunk throws.
+                channelReadBuf.release();
+                channelReadBuf = null;
+              }
             }
           }
         } else {
@@ -1096,16 +1120,19 @@ boolean isStitchOperation() {
   }
 
   /**
-   * Set the exception associated with this operation.
-   * First, if current operationException is null, directly set operationException as exception;
-   * Second, if operationException exists, compare ErrorCodes of exception and existing operation Exception depending
-   * on precedence level. An ErrorCode with a smaller precedence level overrides an ErrorCode with a larger precedence
-   * level. Update the operationException if necessary.
-   * @param exception the {@link RouterException} to possibly set.
+   * Set the exception associated with this operation and mark it complete.
+   * For {@link RouterException}: uses precedence-based replacement where lower precedence
+   * levels override higher ones.
+   * For {@link java.nio.channels.ClosedChannelException}: only set if no other exception has
+   * been set to avoid overwriting meaningful errors.
+   * For all other exception types: set the exception unconditionally, as there is no way to classify them.
+   * @param exception the {@link Exception} to possibly set.
    */
   void setOperationExceptionAndComplete(Exception exception) {
     if (exception instanceof RouterException) {
       RouterUtils.replaceOperationException(operationException, (RouterException) exception, this::getPrecedenceLevel);
+    } else if (exception instanceof ClosedChannelException) {
+      operationException.compareAndSet(null, exception);
     } else {
       operationException.set(exception);
     }
@@ -1642,7 +1669,7 @@ void onFillComplete(boolean updateMetric) {
      * @param channelReadBuf the {@link ByteBuf} from which to read data.
      * @return the number of bytes transferred in this operation.
      */
-    synchronized int fillFrom(ByteBuf channelReadBuf) {
+    int fillFrom(ByteBuf channelReadBuf) {
       int toWrite;
       ByteBuf slice;
       if (buf == null) {

diff --git a/ambry-router/src/test/java/com/github/ambry/router/NonBlockingRouterTest.java b/ambry-router/src/test/java/com/github/ambry/router/NonBlockingRouterTest.java
@@ -14,6 +14,7 @@
 package com.github.ambry.router;
 
 import com.codahale.metrics.MetricRegistry;
+import com.github.ambry.utils.NettyByteBufLeakHelper;
 import com.github.ambry.account.Account;
 import com.github.ambry.account.Container;
 import com.github.ambry.clustermap.DataNodeId;
@@ -22,6 +23,7 @@
 import com.github.ambry.clustermap.PartitionId;
 import com.github.ambry.clustermap.ReplicaId;
 import com.github.ambry.commons.BlobId;
+import com.github.ambry.commons.ByteBufReadableStreamChannel;
 import com.github.ambry.commons.ByteBufferReadableStreamChannel;
 import com.github.ambry.commons.Callback;
 import com.github.ambry.commons.LoggingNotificationSystem;
@@ -37,7 +39,6 @@
 import com.github.ambry.frontend.Operations;
 import com.github.ambry.messageformat.BlobProperties;
 import com.github.ambry.messageformat.MessageFormatRecord;
-import com.github.ambry.named.NamedBlobRecord;
 import com.github.ambry.network.NetworkClient;
 import com.github.ambry.network.NetworkClientErrorCode;
 import com.github.ambry.network.NetworkClientFactory;
@@ -97,6 +98,8 @@
 import java.util.stream.Collectors;
 import java.util.stream.LongStream;
 import javax.sql.DataSource;
+import io.netty.buffer.ByteBuf;
+import io.netty.buffer.PooledByteBufAllocator;
 import org.json.JSONObject;
 import org.junit.AfterClass;
 import org.junit.Assert;
@@ -4550,4 +4553,52 @@ static void verifyRepairRequestRecordInDb(MysqlRepairRequestsDb db, BlobId blobI
       assertEquals(expectedRecord.getExpirationTimeMs(), record.getExpirationTimeMs());
     }
   }
+
+  /**
+   * Test for bytebuf memory leaks in PutOperation when operations are aborted in the middle of a put operation.
+   * This test verifies that PutOperation properly releases bytebuf when the operation completes/fails, even if
+   * the ChunkFiller thread hasn't processed some data yet. The router is always closed, even if the leak check fails.
+   */
+  @Test
+  public void testPutOperationByteBufLeakOnAbort() throws Exception {
+    NettyByteBufLeakHelper testLeakHelper = new NettyByteBufLeakHelper();
+    testLeakHelper.beforeTest();
+
+    Properties props = getNonBlockingRouterProperties(localDcName);
+    int chunkSize = 512;
+    props.setProperty("router.max.put.chunk.size.bytes", Integer.toString(chunkSize));
+    setRouter(props, mockServerLayout, new LoggingNotificationSystem());
+    try {
+      // Each server answers its first two requests with NoError and the following ones with PartitionReadOnly.
+      List<ServerErrorCode> serverErrorList = new ArrayList<>();
+      serverErrorList.add(ServerErrorCode.NoError);
+      serverErrorList.add(ServerErrorCode.NoError);
+      for (int i = 0; i < 100; i++) {
+        serverErrorList.add(ServerErrorCode.PartitionReadOnly);
+      }
+      mockServerLayout.getMockServers().forEach(server -> server.setServerErrors(serverErrorList));
+
+      int blobSize = 100 * chunkSize;
+      byte[] blobData = new byte[blobSize];
+      ThreadLocalRandom.current().nextBytes(blobData);
+      ByteBuf pooledBuf = PooledByteBufAllocator.DEFAULT.buffer(blobSize);
+      pooledBuf.writeBytes(blobData);
+      ByteBufReadableStreamChannel channel = new ByteBufReadableStreamChannel(pooledBuf);
+      BlobProperties blobProperties = new BlobProperties(blobSize, "serviceId", "ownerId", "contentType",
+          false, Utils.Infinite_Time, Utils.getRandomShort(ThreadLocalRandom.current()),
+          Utils.getRandomShort(ThreadLocalRandom.current()), false, null, null, null);
+
+      try {
+        router.putBlob(blobProperties, new byte[10], channel, PutBlobOptions.DEFAULT).get();
+      } catch (ExecutionException ignored) {
+        // Expected for operations that hit error responses
+      }
+      // If there are leaks, it will be detected in NettyByteBufLeakHelper and fail the test.
+      // Should be called before router close as closing of the router shouldn't be required to prevent leaks.
+      testLeakHelper.afterTest();
+    } finally {
+      router.close();
+      router = null;
+    }
+  }
 }