Use cuda filters to support 10-bit videos

dvrogozh · dvrogozh · commit 58b492163012 · 2025-08-26T21:46:18.000Z
For: #776 Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -199,6 +199,101 @@ void CudaDeviceInterface::initializeContext(AVCodecContext* codecContext) {
   return;
 }
 
+// Builds the FiltersContext describing how a decoded frame must be filtered
+// before convertAVFrameToFrameOutput() can turn it into a tensor.
+//
+// - Frames that are not AV_PIX_FMT_CUDA are delegated to the CPU device
+//   interface.
+// - CUDA frames whose software format is already NV12 need no filtering:
+//   nullptr is returned.
+// - For any other software format (the 10-bit case this was added for), the
+//   returned context converts to NV12 on the GPU with scale_cuda or, when
+//   libavfilter is too old for that, downloads the frame and converts it to
+//   RGB24 with CPU filters.
+//
+// Throws (via TORCH_CHECK) if a CUDA frame carries no hw_frames_ctx, if a new
+// reference to it cannot be created, or if its software pixel format is
+// unknown to libavutil.
+std::unique_ptr<FiltersContext> CudaDeviceInterface::initializeFiltersContext(
+    const VideoStreamOptions& videoStreamOptions,
+    const UniqueAVFrame& avFrame,
+    const AVRational& timeBase) {
+  const auto frameFormat = static_cast<AVPixelFormat>(avFrame->format);
+
+  if (frameFormat != AV_PIX_FMT_CUDA) {
+    auto cpuDevice = torch::Device(torch::kCPU);
+    auto cpuInterface = createDeviceInterface(cpuDevice);
+    return cpuInterface->initializeFiltersContext(
+        videoStreamOptions, avFrame, timeBase);
+  }
+
+  // hw_frames_ctx is dereferenced below; fail with a clear error rather than
+  // crash if a CUDA frame arrives without one.
+  TORCH_CHECK(
+      avFrame->hw_frames_ctx != nullptr,
+      "The AVFrame is AV_PIX_FMT_CUDA but does not have a hw_frames_ctx.");
+
+  auto frameDims =
+      getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
+  int height = frameDims.height;
+  int width = frameDims.width;
+
+  auto hwFramesCtx =
+      reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
+  AVPixelFormat actualFormat = hwFramesCtx->sw_format;
+
+  // NV12 frames need no filter graph.
+  if (actualFormat == AV_PIX_FMT_NV12) {
+    return nullptr;
+  }
+
+  auto filtersContext = std::make_unique<FiltersContext>();
+
+  filtersContext->inputWidth = avFrame->width;
+  filtersContext->inputHeight = avFrame->height;
+  filtersContext->inputFormat = frameFormat;
+  filtersContext->inputAspectRatio = avFrame->sample_aspect_ratio;
+  filtersContext->timeBase = timeBase;
+
+  // av_buffer_ref() returns nullptr on failure.
+  AVBufferRef* hwFramesCtxRef = av_buffer_ref(avFrame->hw_frames_ctx);
+  TORCH_CHECK(
+      hwFramesCtxRef != nullptr,
+      "Failed to create a new reference to the frame's hw_frames_ctx.");
+  filtersContext->hwFramesCtx.reset(hwFramesCtxRef);
+
+  std::stringstream filters;
+
+  if (avfilter_version() < AV_VERSION_INT(8, 0, 103)) {
+    // Color conversion support ('format=' option) was added to scale_cuda in
+    // n5.0. With earlier versions of FFmpeg we have no choice but to use CPU
+    // filters. See:
+    // https://github.com/FFmpeg/FFmpeg/commit/62dc5df941f5e196164c151691e4274195523e95
+    filtersContext->outputFormat = AV_PIX_FMT_RGB24;
+
+    // av_pix_fmt_desc_get() returns nullptr for formats it does not know.
+    const AVPixFmtDescriptor* actualFormatDesc =
+        av_pix_fmt_desc_get(actualFormat);
+    TORCH_CHECK(
+        actualFormatDesc != nullptr,
+        "Unknown software pixel format in hw_frames_ctx: ",
+        static_cast<int>(actualFormat));
+
+    filters << "hwdownload,format=" << actualFormatDesc->name;
+    filters << ",scale=" << width << ":" << height;
+    filters << ":sws_flags=bilinear";
+  } else {
+    // Actual output color format will be set via filter options
+    filtersContext->outputFormat = AV_PIX_FMT_CUDA;
+
+    filters << "scale_cuda=" << width << ":" << height;
+    filters << ":format=nv12:interp_algo=bilinear";
+  }
+
+  filtersContext->filters = filters.str();
+  return filtersContext;
+}
+
 void CudaDeviceInterface::convertAVFrameToFrameOutput(
     const VideoStreamOptions& videoStreamOptions,
     [[maybe_unused]] const AVRational& timeBase,
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -21,6 +21,11 @@ class CudaDeviceInterface : public DeviceInterface {
 
   void initializeContext(AVCodecContext* codecContext) override;
 
+  std::unique_ptr<FiltersContext> initializeFiltersContext(
+      const VideoStreamOptions& videoStreamOptions,
+      const UniqueAVFrame& avFrame,
+      const AVRational& timeBase) override;
+
   void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
       const AVRational& timeBase,
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -12,6 +12,7 @@
 #include <stdexcept>
 #include <string>
 #include "FFMPEGCommon.h"
+#include "src/torchcodec/_core/FilterGraph.h"
 #include "src/torchcodec/_core/Frame.h"
 #include "src/torchcodec/_core/StreamOptions.h"
 
@@ -41,6 +42,18 @@ class DeviceInterface {
   // support CUDA and others only support CPU.
   virtual void initializeContext(AVCodecContext* codecContext) = 0;
 
+  // Returns a FiltersContext if the device interface can't handle conversion
+  // of the frame on its own within a call to convertAVFrameToFrameOutput(), or
+  // nullptr (the default) if no filtering is needed. The FiltersContext holds
+  // the input and output parameters describing the required conversion; the
+  // filtered frame can then be passed to convertAVFrameToFrameOutput().
+  virtual std::unique_ptr<FiltersContext> initializeFiltersContext(
+      [[maybe_unused]] const VideoStreamOptions& videoStreamOptions,
+      [[maybe_unused]] const UniqueAVFrame& avFrame,
+      [[maybe_unused]] const AVRational& timeBase) {
+    return nullptr;
+  }
+
   virtual void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
       const AVRational& timeBase,
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1254,6 +1254,23 @@ FrameOutput SingleStreamDecoder::convertAVFrameToFrameOutput(
   if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
     convertAudioAVFrameToFrameOutputOnCPU(avFrame, frameOutput);
   } else if (deviceInterface_) {
+    // Ask the device interface whether this frame needs to go through a filter
+    // graph before it can be converted; nullptr means no filtering is needed.
+    std::unique_ptr<FiltersContext> filtersContext =
+        deviceInterface_->initializeFiltersContext(
+            streamInfo.videoStreamOptions, avFrame, streamInfo.timeBase);
+    if (filtersContext) {
+      // Compare the contexts by value. Comparing the unique_ptrs themselves
+      // compares addresses, which always differ for a freshly allocated
+      // context and would rebuild the filter graph on every frame.
+      if (!filterGraph_ || !prevFiltersContext_ ||
+          *prevFiltersContext_ != *filtersContext) {
+        filterGraph_ = std::make_unique<FilterGraph>(
+            *filtersContext, streamInfo.videoStreamOptions);
+        prevFiltersContext_ = std::move(filtersContext);
+      }
+      avFrame = filterGraph_->convert(avFrame);
+    }
     deviceInterface_->convertAVFrameToFrameOutput(
         streamInfo.videoStreamOptions,
         streamInfo.timeBase,
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -351,6 +351,10 @@ class SingleStreamDecoder {
   SeekMode seekMode_;
   ContainerMetadata containerMetadata_;
   UniqueDecodingAVFormatContext formatContext_;
+  // Filters context of the previous frame. Used to decide whether a new
+  // FilterGraph must be created to process the next frame.
+  std::unique_ptr<FiltersContext> prevFiltersContext_;
+  std::unique_ptr<FilterGraph> filterGraph_;
   std::unique_ptr<DeviceInterface> deviceInterface_;
   std::map<int, StreamInfo> streamInfos_;
   const int NO_ACTIVE_STREAM = -2;
diff --git a/test/test_decoders.py b/test/test_decoders.py
@@ -1225,22 +1225,6 @@ def test_full_and_studio_range_bt709_video(self, asset):
             elif cuda_version_used_for_building_torch() == (12, 8):
                 assert psnr(gpu_frame, cpu_frame) > 20
 
-    @needs_cuda
-    def test_10bit_videos_cuda(self):
-        # Assert that we raise proper error on different kinds of 10bit videos.
-
-        # TODO we should investigate how to support 10bit videos on GPU.
-        # See https://github.com/pytorch/torchcodec/issues/776
-
-        asset = H265_10BITS
-
-        decoder = VideoDecoder(asset.path, device="cuda")
-        with pytest.raises(
-            RuntimeError,
-            match="The AVFrame is p010le, but we expected AV_PIX_FMT_NV12.",
-        ):
-            decoder.get_frame_at(0)
-
     @needs_cuda
     def test_10bit_gpu_fallsback_to_cpu(self):
         # Test for 10-bit videos that aren't supported by NVDEC: we decode and
@@ -1272,12 +1256,13 @@ def test_10bit_gpu_fallsback_to_cpu(self):
         frames_cpu = decoder_cpu.get_frames_at(frame_indices).data
         assert_frames_equal(frames_gpu.cpu(), frames_cpu)
 
+    @pytest.mark.parametrize("device", all_supported_devices())
     @pytest.mark.parametrize("asset", (H264_10BITS, H265_10BITS))
-    def test_10bit_videos_cpu(self, asset):
-        # This just validates that we can decode 10-bit videos on CPU.
+    def test_10bit_videos(self, device, asset):
+        # This just validates that we can decode 10-bit videos.
         # TODO validate against the ref that the decoded frames are correct
 
-        decoder = VideoDecoder(asset.path)
+        decoder = VideoDecoder(asset.path, device=device)
         decoder.get_frame_at(10)
 
     def setup_frame_mappings(tmp_path, file, stream_index):