From 23f3df92c4f7c2d166620a9e55a8c5c0f6cf5aef Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Tue, 26 Aug 2025 20:45:05 +0000
Subject: [PATCH 1/3] Use cuda filters to support 10-bit videos

For: #776

Signed-off-by: Dmitry Rogozhkin
---
 src/torchcodec/_core/CudaDeviceInterface.cpp | 62 ++++++++++++++++++++
 src/torchcodec/_core/CudaDeviceInterface.h   |  5 ++
 src/torchcodec/_core/DeviceInterface.h       | 13 ++++
 src/torchcodec/_core/FilterGraph.cpp         |  6 +-
 src/torchcodec/_core/FilterGraph.h           |  3 +-
 src/torchcodec/_core/SingleStreamDecoder.cpp | 36 ++++++++++++
 src/torchcodec/_core/SingleStreamDecoder.h   |  4 ++
 test/test_decoders.py                        | 23 ++------
 8 files changed, 130 insertions(+), 22 deletions(-)

diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 74b556ed0..375a9bc20 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -199,6 +199,68 @@ void CudaDeviceInterface::initializeContext(AVCodecContext* codecContext) {
   return;
 }
 
+std::unique_ptr<FiltersContext> CudaDeviceInterface::initializeFiltersContext(
+    const VideoStreamOptions& videoStreamOptions,
+    const UniqueAVFrame& avFrame,
+    const AVRational& timeBase) {
+  enum AVPixelFormat frameFormat =
+      static_cast<enum AVPixelFormat>(avFrame->format);
+
+  if (avFrame->format != AV_PIX_FMT_CUDA) {
+    auto cpuDevice = torch::Device(torch::kCPU);
+    auto cpuInterface = createDeviceInterface(cpuDevice);
+    return cpuInterface->initializeFiltersContext(
+        videoStreamOptions, avFrame, timeBase);
+  }
+
+  auto frameDims =
+      getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
+  int height = frameDims.height;
+  int width = frameDims.width;
+
+  auto hwFramesCtx =
+      reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
+  AVPixelFormat actualFormat = hwFramesCtx->sw_format;
+
+  if (actualFormat == AV_PIX_FMT_NV12) {
+    return nullptr;
+  }
+
+  AVPixelFormat outputFormat;
+  std::stringstream filters;
+
+  unsigned version_int = avfilter_version();
+  if (version_int < AV_VERSION_INT(8, 0, 103)) {
+    // Color conversion support ('format=' option) was added to scale_cuda in
+    // n5.0. With earlier versions of FFmpeg we have no choice but to use CPU
+    // filters.
+    // See:
+    // https://github.com/FFmpeg/FFmpeg/commit/62dc5df941f5e196164c151691e4274195523e95
+    outputFormat = AV_PIX_FMT_RGB24;
+
+    filters << "hwdownload,format=" << av_pix_fmt_desc_get(actualFormat)->name;
+    filters << ",scale=" << width << ":" << height;
+    filters << ":sws_flags=bilinear";
+  } else {
+    // The actual output color format will be set via the filter options.
+    outputFormat = AV_PIX_FMT_CUDA;
+
+    filters << "scale_cuda=" << width << ":" << height;
+    filters << ":format=nv12:interp_algo=bilinear";
+  }
+
+  return std::make_unique<FiltersContext>(
+      avFrame->width,
+      avFrame->height,
+      frameFormat,
+      avFrame->sample_aspect_ratio,
+      width,
+      height,
+      outputFormat,
+      filters.str(),
+      timeBase,
+      av_buffer_ref(avFrame->hw_frames_ctx));
+}
+
 void CudaDeviceInterface::convertAVFrameToFrameOutput(
     const VideoStreamOptions& videoStreamOptions,
     [[maybe_unused]] const AVRational& timeBase,
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
index f29caff42..b49908443 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.h
+++ b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -21,6 +21,11 @@ class CudaDeviceInterface : public DeviceInterface {
 
   void initializeContext(AVCodecContext* codecContext) override;
 
+  std::unique_ptr<FiltersContext> initializeFiltersContext(
+      const VideoStreamOptions& videoStreamOptions,
+      const UniqueAVFrame& avFrame,
+      const AVRational& timeBase) override;
+
   void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
       const AVRational& timeBase,
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
index 9a7288eb0..7916c81b2 100644
--- a/src/torchcodec/_core/DeviceInterface.h
+++ b/src/torchcodec/_core/DeviceInterface.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
 #include "FFMPEGCommon.h"
+#include "src/torchcodec/_core/FilterGraph.h"
 #include "src/torchcodec/_core/Frame.h"
 #include "src/torchcodec/_core/StreamOptions.h"
 
@@ -33,6 +34,18 @@ class DeviceInterface {
   // support CUDA and others only support CPU.
   virtual void initializeContext(AVCodecContext* codecContext) = 0;
 
+  // Returns a FiltersContext if the device interface can't handle conversion
+  // of the frame on its own within a call to convertAVFrameToFrameOutput().
+  // The FiltersContext contains the input and output initialization
+  // parameters describing the required conversion. Its output can then be
+  // passed to convertAVFrameToFrameOutput() to generate the output tensor.
+  virtual std::unique_ptr<FiltersContext> initializeFiltersContext(
+      [[maybe_unused]] const VideoStreamOptions& videoStreamOptions,
+      [[maybe_unused]] const UniqueAVFrame& avFrame,
+      [[maybe_unused]] const AVRational& timeBase) {
+    return nullptr;
+  };
+
   virtual void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
       const AVRational& timeBase,
diff --git a/src/torchcodec/_core/FilterGraph.cpp b/src/torchcodec/_core/FilterGraph.cpp
index 43a12f092..c22875915 100644
--- a/src/torchcodec/_core/FilterGraph.cpp
+++ b/src/torchcodec/_core/FilterGraph.cpp
@@ -22,7 +22,8 @@ FiltersContext::FiltersContext(
     int outputHeight,
     AVPixelFormat outputFormat,
     const std::string& filtergraphStr,
-    AVRational timeBase)
+    AVRational timeBase,
+    AVBufferRef* hwFramesCtx)
     : inputWidth(inputWidth),
       inputHeight(inputHeight),
       inputFormat(inputFormat),
@@ -31,7 +32,8 @@ FiltersContext::FiltersContext(
       outputHeight(outputHeight),
       outputFormat(outputFormat),
       filtergraphStr(filtergraphStr),
-      timeBase(timeBase) {}
+      timeBase(timeBase),
+      hwFramesCtx(hwFramesCtx) {}
 
 bool operator==(const AVRational& lhs, const AVRational& rhs) {
   return lhs.num == rhs.num && lhs.den == rhs.den;
diff --git a/src/torchcodec/_core/FilterGraph.h b/src/torchcodec/_core/FilterGraph.h
index 4edff6c1b..8cba571bd 100644
--- a/src/torchcodec/_core/FilterGraph.h
+++ b/src/torchcodec/_core/FilterGraph.h
@@ -35,7 +35,8 @@ struct FiltersContext {
       int outputHeight,
       AVPixelFormat outputFormat,
       const std::string& filtergraphStr,
-      AVRational timeBase);
+      AVRational timeBase,
+      AVBufferRef* hwFramesCtx = nullptr);
 
   bool operator==(const FiltersContext&) const;
   bool operator!=(const FiltersContext&) const;
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
index 864e82d0a..01d9f3cf9 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.cpp
+++ b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1251,6 +1251,42 @@ FrameOutput SingleStreamDecoder::convertAVFrameToFrameOutput(
       deviceInterface_ != nullptr,
       "No device interface available for video decoding. This ",
       "shouldn't happen, please report.");
+
+  std::unique_ptr<FiltersContext> newFiltersContext =
+      deviceInterface_->initializeFiltersContext(
+          streamInfo.videoStreamOptions, avFrame, streamInfo.timeBase);
+  // The device interface may return nullptr for the filters context, in
+  // which case it will handle the conversion directly in
+  // convertAVFrameToFrameOutput().
+  if (newFiltersContext) {
+    // We need to compare the current filters context with our previous one.
+    // If they are different, we need to re-create the filter graph. We
+    // create the filter graph late so that we don't have to depend on the
+    // unreliable metadata in the header, and we sometimes re-create it
+    // because the frame resolution can change mid-stream. Finally, we want
+    // to reuse the filter graph as much as possible for performance reasons.
+    if (!filterGraph_ || !filtersContext_ ||
+        *filtersContext_ != *newFiltersContext) {
+      filterGraph_ = std::make_unique<FilterGraph>(
+          *newFiltersContext, streamInfo.videoStreamOptions);
+      filtersContext_ = std::move(newFiltersContext);
+    }
+    avFrame = filterGraph_->convert(avFrame);
+
+    // If this check fails, it means the frame wasn't reshaped to its
+    // expected dimensions by the filter graph.
+    TORCH_CHECK(
+        (avFrame->width == filtersContext_->outputWidth) &&
+            (avFrame->height == filtersContext_->outputHeight),
+        "Expected frame from filter graph of ",
+        filtersContext_->outputWidth,
+        "x",
+        filtersContext_->outputHeight,
+        ", got ",
+        avFrame->width,
+        "x",
+        avFrame->height);
+  }
   deviceInterface_->convertAVFrameToFrameOutput(
       streamInfo.videoStreamOptions,
       streamInfo.timeBase,
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
index 027f52fc4..98fbf6cf8 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.h
+++ b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -351,6 +351,10 @@ class SingleStreamDecoder {
   SeekMode seekMode_;
   ContainerMetadata containerMetadata_;
   UniqueDecodingAVFormatContext formatContext_;
+  // Current filters context. Used to know whether a new FilterGraph should
+  // be created to process the next frame.
+  std::unique_ptr<FiltersContext> filtersContext_;
+  std::unique_ptr<FilterGraph> filterGraph_;
   std::unique_ptr<DeviceInterface> deviceInterface_;
   std::map<int, StreamInfo> streamInfos_;
   const int NO_ACTIVE_STREAM = -2;
diff --git a/test/test_decoders.py b/test/test_decoders.py
index 72b586891..d0525b741 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -1225,22 +1225,6 @@ def test_full_and_studio_range_bt709_video(self, asset):
         elif cuda_version_used_for_building_torch() == (12, 8):
             assert psnr(gpu_frame, cpu_frame) > 20
 
-    @needs_cuda
-    def test_10bit_videos_cuda(self):
-        # Assert that we raise proper error on different kinds of 10bit videos.
-
-        # TODO we should investigate how to support 10bit videos on GPU.
-        # See https://github.com/pytorch/torchcodec/issues/776
-
-        asset = H265_10BITS
-
-        decoder = VideoDecoder(asset.path, device="cuda")
-        with pytest.raises(
-            RuntimeError,
-            match="The AVFrame is p010le, but we expected AV_PIX_FMT_NV12.",
-        ):
-            decoder.get_frame_at(0)
-
     @needs_cuda
     def test_10bit_gpu_fallsback_to_cpu(self):
         # Test for 10-bit videos that aren't supported by NVDEC: we decode and
@@ -1272,12 +1256,13 @@ def test_10bit_gpu_fallsback_to_cpu(self):
             frames_cpu = decoder_cpu.get_frames_at(frame_indices).data
         assert_frames_equal(frames_gpu.cpu(), frames_cpu)
 
+    @pytest.mark.parametrize("device", all_supported_devices())
     @pytest.mark.parametrize("asset", (H264_10BITS, H265_10BITS))
-    def test_10bit_videos_cpu(self, asset):
-        # This just validates that we can decode 10-bit videos on CPU.
+    def test_10bit_videos(self, device, asset):
+        # This just validates that we can decode 10-bit videos.
         # TODO validate against the ref that the decoded frames are correct
-        decoder = VideoDecoder(asset.path)
+        decoder = VideoDecoder(asset.path, device=device)
         decoder.get_frame_at(10)
 
 
 def setup_frame_mappings(tmp_path, file, stream_index):
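Note on the scale_cuda version gate in patch 1: below is a minimal standalone sketch of the same branch logic, assuming only the FFmpeg development headers (buildCudaFilters is a hypothetical helper name, not part of the patch):

    extern "C" {
    #include <libavfilter/avfilter.h>
    #include <libavutil/version.h>
    }
    #include <sstream>
    #include <string>

    // Pick CPU or GPU filters based on the libavfilter runtime version,
    // mirroring the branches in CudaDeviceInterface::initializeFiltersContext().
    std::string buildCudaFilters(int width, int height, const std::string& swFormat) {
      std::stringstream filters;
      if (avfilter_version() < AV_VERSION_INT(8, 0, 103)) {
        // scale_cuda only gained a 'format=' option in FFmpeg n5.0, so
        // download the frame and convert on the CPU instead.
        filters << "hwdownload,format=" << swFormat;
        filters << ",scale=" << width << ":" << height << ":sws_flags=bilinear";
      } else {
        // Convert the 10-bit surface (e.g. p010le) to nv12 on the GPU.
        filters << "scale_cuda=" << width << ":" << height;
        filters << ":format=nv12:interp_algo=bilinear";
      }
      return filters.str();
    }

For a 1920x1080 p010le stream this yields "scale_cuda=1920:1080:format=nv12:interp_algo=bilinear" on FFmpeg n5.0 and newer, and "hwdownload,format=p010le,scale=1920:1080:sws_flags=bilinear" on older builds.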
From 745ed48df9cc8bb1c3de55df9459fbb35efa2d8c Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Thu, 28 Aug 2025 20:51:16 +0000
Subject: [PATCH 2/3] Implement initializeFiltersContext for CPU device
 interface

Signed-off-by: Dmitry Rogozhkin
---
 src/torchcodec/_core/CpuDeviceInterface.cpp | 147 ++++++++++----------
 src/torchcodec/_core/CpuDeviceInterface.h   |  15 +-
 2 files changed, 78 insertions(+), 84 deletions(-)

diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
index 77eaf3d09..1eaa3be15 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.cpp
+++ b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -13,6 +13,34 @@ static bool g_cpu = registerDeviceInterface(
     torch::kCPU,
     [](const torch::Device& device) { return new CpuDeviceInterface(device); });
 
+ColorConversionLibrary getColorConversionLibrary(
+    const VideoStreamOptions& videoStreamOptions,
+    int width) {
+  // By default, we want to use swscale for color conversion because it is
+  // faster. However, it has width requirements, so we may need to fall back
+  // to filtergraph. We also need to respect what was requested from the
+  // options; we respect the options unconditionally, so it's possible for
+  // swscale's width requirements to be violated. We don't expose the ability
+  // to choose color conversion library publicly; we only use this ability
+  // internally.
+
+  // swscale requires widths to be multiples of 32:
+  // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
+  // so we fall back to filtergraph if the width is not a multiple of 32.
+  auto defaultLibrary = (width % 32 == 0)
+      ? ColorConversionLibrary::SWSCALE
+      : ColorConversionLibrary::FILTERGRAPH;
+
+  ColorConversionLibrary colorConversionLibrary =
+      videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
+
+  TORCH_CHECK(
+      colorConversionLibrary == ColorConversionLibrary::SWSCALE ||
+          colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH,
+      "Invalid color conversion library: ",
+      static_cast<int>(colorConversionLibrary));
+  return colorConversionLibrary;
+}
+
 } // namespace
 
 CpuDeviceInterface::SwsFrameContext::SwsFrameContext(
@@ -46,6 +74,38 @@ CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
       device_.type() == torch::kCPU, "Unsupported device: ", device_.str());
 }
 
+std::unique_ptr<FiltersContext> CpuDeviceInterface::initializeFiltersContext(
+    const VideoStreamOptions& videoStreamOptions,
+    const UniqueAVFrame& avFrame,
+    const AVRational& timeBase) {
+  enum AVPixelFormat frameFormat =
+      static_cast<enum AVPixelFormat>(avFrame->format);
+  auto frameDims =
+      getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
+  int expectedOutputHeight = frameDims.height;
+  int expectedOutputWidth = frameDims.width;
+
+  if (getColorConversionLibrary(videoStreamOptions, expectedOutputWidth) ==
+      ColorConversionLibrary::SWSCALE) {
+    return nullptr;
+  }
+
+  std::stringstream filters;
+  filters << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
+  filters << ":sws_flags=bilinear";
+
+  return std::make_unique<FiltersContext>(
+      avFrame->width,
+      avFrame->height,
+      frameFormat,
+      avFrame->sample_aspect_ratio,
+      expectedOutputWidth,
+      expectedOutputHeight,
+      AV_PIX_FMT_RGB24,
+      filters.str(),
+      timeBase);
+}
+
 // Note [preAllocatedOutputTensor with swscale and filtergraph]:
 // Callers may pass a pre-allocated tensor, where the output.data tensor will
 // be stored. This parameter is honored in any case, but it only leads to a
@@ -57,7 +117,7 @@ CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
 // `dimension_order` parameter. It's up to callers to re-shape it if needed.
 void CpuDeviceInterface::convertAVFrameToFrameOutput(
     const VideoStreamOptions& videoStreamOptions,
-    const AVRational& timeBase,
+    [[maybe_unused]] const AVRational& timeBase,
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
@@ -83,23 +143,8 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
   enum AVPixelFormat frameFormat =
       static_cast<enum AVPixelFormat>(avFrame->format);
 
-  // By default, we want to use swscale for color conversion because it is
-  // faster. However, it has width requirements, so we may need to fall back
-  // to filtergraph. We also need to respect what was requested from the
-  // options; we respect the options unconditionally, so it's possible for
-  // swscale's width requirements to be violated. We don't expose the ability
-  // to choose color conversion library publicly; we only use this ability
-  // internally.
-
-  // swscale requires widths to be multiples of 32:
-  // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
-  // so we fall back to filtergraph if the width is not a multiple of 32.
-  auto defaultLibrary = (expectedOutputWidth % 32 == 0)
-      ? ColorConversionLibrary::SWSCALE
-      : ColorConversionLibrary::FILTERGRAPH;
-
   ColorConversionLibrary colorConversionLibrary =
-      videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
+      getColorConversionLibrary(videoStreamOptions, expectedOutputWidth);
 
   if (colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
     // We need to compare the current frame context with our previous frame
@@ -137,42 +182,16 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
     frameOutput.data = outputTensor;
   } else if (colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
-    // See comment above in swscale branch about the filterGraphContext_
-    // creation.
-    std::stringstream filters;
-    filters << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
-    filters << ":sws_flags=bilinear";
+    TORCH_CHECK_EQ(avFrame->format, AV_PIX_FMT_RGB24);
 
-    FiltersContext filtersContext(
-        avFrame->width,
-        avFrame->height,
-        frameFormat,
-        avFrame->sample_aspect_ratio,
-        expectedOutputWidth,
-        expectedOutputHeight,
-        AV_PIX_FMT_RGB24,
-        filters.str(),
-        timeBase);
-
-    if (!filterGraphContext_ || prevFiltersContext_ != filtersContext) {
-      filterGraphContext_ =
-          std::make_unique<FilterGraph>(filtersContext, videoStreamOptions);
-      prevFiltersContext_ = std::move(filtersContext);
-    }
-    outputTensor = convertAVFrameToTensorUsingFilterGraph(avFrame);
-
-    // Similarly to above, if this check fails it means the frame wasn't
-    // reshaped to its expected dimensions by filtergraph.
-    auto shape = outputTensor.sizes();
-    TORCH_CHECK(
-        (shape.size() == 3) && (shape[0] == expectedOutputHeight) &&
-            (shape[1] == expectedOutputWidth) && (shape[2] == 3),
-        "Expected output tensor of shape ",
-        expectedOutputHeight,
-        "x",
-        expectedOutputWidth,
-        "x3, got ",
-        shape);
+    std::vector<int64_t> shape = {expectedOutputHeight, expectedOutputWidth, 3};
+    std::vector<int64_t> strides = {avFrame->linesize[0], 3, 1};
+    AVFrame* avFramePtr = avFrame.release();
+    auto deleter = [avFramePtr](void*) {
+      UniqueAVFrame avFrameToDelete(avFramePtr);
+    };
+    outputTensor = torch::from_blob(
+        avFramePtr->data[0], shape, strides, deleter, {torch::kUInt8});
 
     if (preAllocatedOutputTensor.has_value()) {
       // We have already validated that preAllocatedOutputTensor and
@@ -182,11 +201,6 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
     } else {
       frameOutput.data = outputTensor;
     }
-  } else {
-    TORCH_CHECK(
-        false,
-        "Invalid color conversion library: ",
-        static_cast<int>(colorConversionLibrary));
   }
 }
 
@@ -208,25 +222,6 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwsScale(
   return resultHeight;
 }
 
-torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
-    const UniqueAVFrame& avFrame) {
-  UniqueAVFrame filteredAVFrame = filterGraphContext_->convert(avFrame);
-
-  TORCH_CHECK_EQ(filteredAVFrame->format, AV_PIX_FMT_RGB24);
-
-  auto frameDims = getHeightAndWidthFromResizedAVFrame(*filteredAVFrame.get());
-  int height = frameDims.height;
-  int width = frameDims.width;
-  std::vector<int64_t> shape = {height, width, 3};
-  std::vector<int64_t> strides = {filteredAVFrame->linesize[0], 3, 1};
-  AVFrame* filteredAVFramePtr = filteredAVFrame.release();
-  auto deleter = [filteredAVFramePtr](void*) {
-    UniqueAVFrame avFrameToDelete(filteredAVFramePtr);
-  };
-  return torch::from_blob(
-      filteredAVFramePtr->data[0], shape, strides, deleter, {torch::kUInt8});
-}
-
 void CpuDeviceInterface::createSwsContext(
     const SwsFrameContext& swsFrameContext,
     const enum AVColorSpace colorspace) {
diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h
index d6004ca3b..f1982fb93 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.h
+++ b/src/torchcodec/_core/CpuDeviceInterface.h
@@ -26,6 +26,11 @@ class CpuDeviceInterface : public DeviceInterface {
   void initializeContext(
       [[maybe_unused]] AVCodecContext* codecContext) override {}
 
+  std::unique_ptr<FiltersContext> initializeFiltersContext(
+      const VideoStreamOptions& videoStreamOptions,
+      const UniqueAVFrame& avFrame,
+      const AVRational& timeBase) override;
+
   void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
       const AVRational& timeBase,
@@ -39,9 +44,6 @@ class CpuDeviceInterface : public DeviceInterface {
       const UniqueAVFrame& avFrame,
       torch::Tensor& outputTensor);
 
-  torch::Tensor convertAVFrameToTensorUsingFilterGraph(
-      const UniqueAVFrame& avFrame);
-
   struct SwsFrameContext {
     int inputWidth = 0;
     int inputHeight = 0;
@@ -64,15 +66,12 @@ class CpuDeviceInterface : public DeviceInterface {
       const SwsFrameContext& swsFrameContext,
      const enum AVColorSpace colorspace);
 
-  // color-conversion fields. Only one of FilterGraphContext and
-  // UniqueSwsContext should be non-null.
-  std::unique_ptr<FilterGraph> filterGraphContext_;
+  // SWS color conversion context.
   UniqueSwsContext swsContext_;
 
-  // Used to know whether a new FilterGraphContext or UniqueSwsContext should
+  // Used to know whether a new UniqueSwsContext should
   // be created before decoding a new frame.
   SwsFrameContext prevSwsFrameContext_;
-  FiltersContext prevFiltersContext_;
 };
 
 } // namespace facebook::torchcodec
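Note on the filtergraph branch in patch 2: the RGB24 AVFrame produced by the filter graph is handed to the output tensor without a copy, and the tensor's custom deleter frees the frame when the tensor dies. A reduced sketch of that pattern, assuming LibTorch and FFmpeg headers (wrapRgb24Frame is a hypothetical name; the patch uses UniqueAVFrame rather than av_frame_free):

    #include <torch/types.h>
    #include <vector>
    extern "C" {
    #include <libavutil/frame.h>
    }

    // The tensor borrows the frame's pixel buffer; the deleter releases the
    // AVFrame only once the tensor itself is destroyed.
    torch::Tensor wrapRgb24Frame(AVFrame* frame) {
      std::vector<int64_t> shape = {frame->height, frame->width, 3};
      // linesize[0] may include row padding, hence the explicit row stride.
      std::vector<int64_t> strides = {frame->linesize[0], 3, 1};
      auto deleter = [frame](void*) mutable { av_frame_free(&frame); };
      return torch::from_blob(
          frame->data[0], shape, strides, deleter, {torch::kUInt8});
    }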
From 8f899e16d8154512a2548b23ace49c0c3804a0bc Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Tue, 2 Sep 2025 18:45:17 +0000
Subject: [PATCH 3/3] Drop timeBase from convertAVFrameToFrameOutput API

Signed-off-by: Dmitry Rogozhkin
---
 src/torchcodec/_core/CpuDeviceInterface.cpp  | 1 -
 src/torchcodec/_core/CpuDeviceInterface.h    | 1 -
 src/torchcodec/_core/CudaDeviceInterface.cpp | 7 +------
 src/torchcodec/_core/CudaDeviceInterface.h   | 1 -
 src/torchcodec/_core/DeviceInterface.h       | 1 -
 src/torchcodec/_core/SingleStreamDecoder.cpp | 1 -
 6 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
index 1eaa3be15..7aa0b48ce 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.cpp
+++ b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -117,7 +117,6 @@ std::unique_ptr<FiltersContext> CpuDeviceInterface::initializeFiltersContext(
 // `dimension_order` parameter. It's up to callers to re-shape it if needed.
 void CpuDeviceInterface::convertAVFrameToFrameOutput(
     const VideoStreamOptions& videoStreamOptions,
-    [[maybe_unused]] const AVRational& timeBase,
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h
index f1982fb93..1e86ded6b 100644
--- a/src/torchcodec/_core/CpuDeviceInterface.h
+++ b/src/torchcodec/_core/CpuDeviceInterface.h
@@ -33,7 +33,6 @@ class CpuDeviceInterface : public DeviceInterface {
 
   void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
-      const AVRational& timeBase,
       UniqueAVFrame& avFrame,
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor =
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 375a9bc20..b34b2a6f3 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -263,7 +263,6 @@ std::unique_ptr<FiltersContext> CudaDeviceInterface::initializeFiltersContext(
 
 void CudaDeviceInterface::convertAVFrameToFrameOutput(
     const VideoStreamOptions& videoStreamOptions,
-    [[maybe_unused]] const AVRational& timeBase,
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
@@ -281,11 +280,7 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
 
     FrameOutput cpuFrameOutput;
     cpuInterface->convertAVFrameToFrameOutput(
-        videoStreamOptions,
-        timeBase,
-        avFrame,
-        cpuFrameOutput,
-        preAllocatedOutputTensor);
+        videoStreamOptions, avFrame, cpuFrameOutput, preAllocatedOutputTensor);
 
     frameOutput.data = cpuFrameOutput.data.to(device_);
     return;
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
index b49908443..45419b9f4 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.h
+++ b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -28,7 +28,6 @@ class CudaDeviceInterface : public DeviceInterface {
 
   void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
-      const AVRational& timeBase,
       UniqueAVFrame& avFrame,
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor =
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
index 7916c81b2..90c05f049 100644
--- a/src/torchcodec/_core/DeviceInterface.h
+++ b/src/torchcodec/_core/DeviceInterface.h
@@ -48,7 +48,6 @@ class DeviceInterface {
 
   virtual void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
-      const AVRational& timeBase,
       UniqueAVFrame& avFrame,
       FrameOutput& frameOutput,
       std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt) = 0;
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
index 01d9f3cf9..58e41a686 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.cpp
+++ b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1289,7 +1289,6 @@ FrameOutput SingleStreamDecoder::convertAVFrameToFrameOutput(
   }
   deviceInterface_->convertAVFrameToFrameOutput(
       streamInfo.videoStreamOptions,
-      streamInfo.timeBase,
       avFrame,
       frameOutput,
       preAllocatedOutputTensor);
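Note on the filter-graph caching in patch 1: the decoder compares the pointed-to FiltersContext values (*filtersContext_ != *newFiltersContext) rather than the unique_ptrs themselves, because std::unique_ptr equality compares the managed addresses, and two separately allocated contexts with identical parameters would never compare equal, rebuilding the graph on every frame. A self-contained illustration with a hypothetical reduced Ctx type standing in for FiltersContext:

    #include <cassert>
    #include <memory>

    // Stand-in for FiltersContext with value equality.
    struct Ctx {
      int width;
      int height;
      bool operator==(const Ctx& other) const {
        return width == other.width && height == other.height;
      }
      bool operator!=(const Ctx& other) const {
        return !(*this == other);
      }
    };

    int main() {
      auto prev = std::make_unique<Ctx>(Ctx{1920, 1080});
      auto next = std::make_unique<Ctx>(Ctx{1920, 1080});
      assert(prev != next);   // unique_ptr comparison: different allocations.
      assert(*prev == *next); // value comparison: same parameters, reuse graph.
      return 0;
    }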