-
Notifications
You must be signed in to change notification settings - Fork 286
Fix ByteBuf memory leak in PutOperation when operations are aborted #3176
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -57,6 +57,7 @@ | |
| import io.netty.buffer.Unpooled; | ||
| import io.netty.util.ReferenceCountUtil; | ||
| import java.nio.ByteBuffer; | ||
| import java.nio.channels.ClosedChannelException; | ||
| import java.security.GeneralSecurityException; | ||
| import java.util.ArrayList; | ||
| import java.util.Collections; | ||
|
|
@@ -574,10 +575,23 @@ void setOperationCompleted() { | |
|
|
||
| /** | ||
| * Clean up the chunks to release any data buffer. This should be invoked when terminating the operation with | ||
| * an exception. | ||
| * an exception. This method also closes the chunkFillerChannel to fire any pending callbacks, ensuring the original | ||
| * buffer from the ReadableStreamChannel is properly released. Synchronized for memory visibility on channelReadBuf. | ||
| * The contract upheld by PutManager is that this method is called AT-MOST-ONCE. | ||
| */ | ||
| public void cleanupChunks() { | ||
| releaseDataForAllChunks(); | ||
| public synchronized void cleanupChunks() { | ||
| try { | ||
| releaseDataForAllChunks(); | ||
| } finally { | ||
| // Release the extra reference we retained when storing in channelReadBuf. | ||
| if (channelReadBuf != null) { | ||
| channelReadBuf.release(); | ||
| channelReadBuf = null; | ||
| } | ||
| // Close the chunkFillerChannel to fire any remaining callbacks in chunksAwaitingResolution. | ||
| // This ensures the original buffer (owned by the callback) is released and not leaked. | ||
| chunkFillerChannel.close(); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -676,13 +690,18 @@ boolean isChunkFillingDone() { | |
| * chunkFillerChannel, if there is any. | ||
| * @throws InterruptedException if the call to get a chunk from the chunkFillerChannel is interrupted. | ||
| */ | ||
| void fillChunks() { | ||
| synchronized void fillChunks() { | ||
| try { | ||
| PutChunk chunkToFill; | ||
| while (!isChunkFillingDone()) { | ||
| // Attempt to fill a chunk | ||
| if (channelReadBuf == null) { | ||
| channelReadBuf = chunkFillerChannel.getNextByteBuf(0); | ||
| if (channelReadBuf != null) { | ||
| // Retain the buffer to protect against the channel callback releasing it | ||
| // while we still hold a reference we're processing. | ||
| channelReadBuf.retain(); | ||
| } | ||
| } | ||
| if (channelReadBuf != null) { | ||
| if (channelReadBuf.readableBytes() > 0 && isChunkAwaitingResolution()) { | ||
|
|
@@ -707,8 +726,13 @@ void fillChunks() { | |
| routerCallback.onPollReady(); | ||
| } | ||
| if (!channelReadBuf.isReadable()) { | ||
| chunkFillerChannel.resolveOldestChunk(null); | ||
| channelReadBuf = null; | ||
| try { | ||
| chunkFillerChannel.resolveOldestChunk(null); | ||
| } finally { | ||
| // Release the reference we retained when storing getNextByteBuf, even if resolveOldestChunk throws. | ||
| channelReadBuf.release(); | ||
| channelReadBuf = null; | ||
| } | ||
| } | ||
| } | ||
| } else { | ||
|
|
@@ -1096,16 +1120,19 @@ boolean isStitchOperation() { | |
| } | ||
|
|
||
| /** | ||
| * Set the exception associated with this operation. | ||
| * First, if current operationException is null, directly set operationException as exception; | ||
| * Second, if operationException exists, compare ErrorCodes of exception and existing operation Exception depending | ||
| * on precedence level. An ErrorCode with a smaller precedence level overrides an ErrorCode with a larger precedence | ||
| * level. Update the operationException if necessary. | ||
| * @param exception the {@link RouterException} to possibly set. | ||
| * Set the exception associated with this operation and mark it complete. | ||
| * For {@link RouterException}: uses precedence-based replacement where lower precedence | ||
| * levels override higher ones. | ||
| * For {@link java.nio.channels.ClosedChannelException}: only set if no other exception has | ||
| * been set to avoid overwriting meaningful errors. | ||
| * For all others simply set the exception as we don't know what they are or how to classify them. | ||
| * @param exception the {@link Exception} to possibly set. | ||
| */ | ||
| void setOperationExceptionAndComplete(Exception exception) { | ||
| if (exception instanceof RouterException) { | ||
| RouterUtils.replaceOperationException(operationException, (RouterException) exception, this::getPrecedenceLevel); | ||
| } else if (exception instanceof ClosedChannelException) { | ||
| operationException.compareAndSet(null, exception); | ||
| } else { | ||
| operationException.set(exception); | ||
| } | ||
|
|
@@ -1642,7 +1669,7 @@ void onFillComplete(boolean updateMetric) { | |
| * @param channelReadBuf the {@link ByteBuf} from which to read data. | ||
| * @return the number of bytes transferred in this operation. | ||
| */ | ||
| synchronized int fillFrom(ByteBuf channelReadBuf) { | ||
| int fillFrom(ByteBuf channelReadBuf) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We still need this function to be synchronized; the lock is here to protect against a race condition.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's walk through it to see if that's the case.
Since fillChunks is only accessed by a single thread, it only needs to be synchronized against concurrent access from error / cleanup threads. What is needed is for any objects used within the fillChunks routine that may also be concurrently accessed by those threads to be either behind a more narrowly scoped lock or declared volatile. So let's look at that. In PutManager.poll we have: So
Therefore we need to a) make sure anything that happens within In PutManager.completePendingOperations we have: and So let's look at condition a: So for condition a we set the exception, set operation completed, and clear chunks which are provably finished. None of this will involve concurrent modification with fillChunks. Let's look at condition b: For condition b we can either add synchronized to fillChunks (instead of fillFrom) or we can add a lock around updating the operationCompleted value (and wherever we need to avoid TOCTOU). The synchronized on fillChunks should add the least complexity without too large an overhead, as most of the work in fillChunks happens within an internal loop depending on the operationCompleted value.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see you added |
||
| int toWrite; | ||
| ByteBuf slice; | ||
| if (buf == null) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are two cases in which we need to close the chunkFillerChannel to clean up the buffered chunks.
In both cases, an exception would be sent back to the
`NettyResponseChannel` to close `NettyRequest`. When `NettyRequest` is closed, it would call the callback method we passed in to the `readInto` method, which would close the chunk filler channel. `ByteBufferAsyncWritableChannel` has an atomic boolean variable to make sure the channel is closed only once, so calling `close` twice won't do any harm. But it's also not necessary.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So the order matters here.
Ergo, only where the chunkFillerChannel is closed should channelReadBuf be released.
If what you assert is true, that the readInto callback is always called, we should just move all of the
`cleanupChunks()` logic into the readInto callback and delete `cleanupChunks` outright. That would fully delegate responsibility and remove this confusion over whose job it is to do what.
If the readInto callback isn't always called, then we need to close it here when channelReadBuf is released. Let me know which you'd prefer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moving
`cleanupChunks` to the `readInto` callback is not a bad idea. I would prefer it that way. This way, we can make it a private method and don't have to call it from `PutManager`. It's probably better this way.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did more work on this and the existing test suite makes it very clear: the system does not expect the readInto callback to clean up the chunks. I think the invariant we originally had here is correct:
`readInto` doesn't have a guarantee that it runs after the PutOperation is done with the chunks, so it can't clean up the chunks. Ergo, I think the current PR commit is the right implementation.