changes

Dan-Flores · Dan-Flores · commit fb8b8fa5a14b · 2025-10-29T01:41:54.000-04:00
diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h
@@ -18,11 +18,6 @@ class CpuDeviceInterface : public DeviceInterface {
 
   virtual ~CpuDeviceInterface() {}
 
-  std::optional<const AVCodec*> findCodec(
-      [[maybe_unused]] const AVCodecID& codecId) override {
-    return std::nullopt;
-  }
-
   virtual void initialize(
       const AVStream* avStream,
       const UniqueDecodingAVFormatContext& avFormatCtx,
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -329,11 +329,40 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
       avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
 }
 
+namespace {
+// Helper function to check if a codec supports CUDA hardware acceleration
+bool codecSupportsCudaHardware(const AVCodec* codec) {
+  const AVCodecHWConfig* config = nullptr;
+  for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr; ++j) {
+    if (config->device_type == AV_HWDEVICE_TYPE_CUDA) {
+      return true;
+    }
+  }
+  return false;
+}
+} // namespace
+
 // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9
 // we have to do this because of an FFmpeg bug where hardware decoding is not
 // appropriately set, so we just go off and find the matching codec for the CUDA
 // device
-std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
+
+std::optional<const AVCodec*> CudaDeviceInterface::findEncoder(
+    const AVCodecID& codecId) {
+  void* i = nullptr;
+  const AVCodec* codec = nullptr;
+  while ((codec = av_codec_iterate(&i)) != nullptr) {
+    if (codec->id != codecId || !av_codec_is_encoder(codec)) {
+      continue;
+    }
+    if (codecSupportsCudaHardware(codec)) {
+      return codec;
+    }
+  }
+  return std::nullopt;
+}
+
+std::optional<const AVCodec*> CudaDeviceInterface::findDecoder(
     const AVCodecID& codecId) {
   void* i = nullptr;
   const AVCodec* codec = nullptr;
@@ -342,12 +371,8 @@ std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
       continue;
     }
 
-    const AVCodecHWConfig* config = nullptr;
-    for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr;
-         ++j) {
-      if (config->device_type == AV_HWDEVICE_TYPE_CUDA) {
-        return codec;
-      }
+    if (codecSupportsCudaHardware(codec)) {
+      return codec;
     }
   }
 
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -18,7 +18,8 @@ class CudaDeviceInterface : public DeviceInterface {
 
   virtual ~CudaDeviceInterface();
 
-  std::optional<const AVCodec*> findCodec(const AVCodecID& codecId) override;
+  std::optional<const AVCodec*> findEncoder(const AVCodecID& codecId) override;
+  std::optional<const AVCodec*> findDecoder(const AVCodecID& codecId) override;
 
   void initialize(
       const AVStream* avStream,
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -46,7 +46,12 @@ class DeviceInterface {
     return device_;
   };
 
-  virtual std::optional<const AVCodec*> findCodec(
+  virtual std::optional<const AVCodec*> findEncoder(
+      [[maybe_unused]] const AVCodecID& codecId) {
+    return std::nullopt;
+  };
+
+  virtual std::optional<const AVCodec*> findDecoder(
       [[maybe_unused]] const AVCodecID& codecId) {
     return std::nullopt;
   };
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -615,10 +615,25 @@ VideoEncoder::VideoEncoder(
 
 void VideoEncoder::initializeEncoder(
     const VideoStreamOptions& videoStreamOptions) {
+  deviceInterface_ = createDeviceInterface(
+      videoStreamOptions.device, videoStreamOptions.deviceVariant);
+  TORCH_CHECK(
+      deviceInterface_ != nullptr,
+      "Failed to create device interface. This should never happen, please report.");
+
   const AVCodec* avCodec =
       avcodec_find_encoder(avFormatContext_->oformat->video_codec);
   TORCH_CHECK(avCodec != nullptr, "Video codec not found");
 
+  // Try to find a hardware-accelerated encoder if not using CPU
+  if (videoStreamOptions.device.type() != torch::kCPU) {
+    auto hardwareCodec =
+        deviceInterface_->findEncoder(avFormatContext_->oformat->video_codec);
+    if (hardwareCodec.has_value()) {
+      avCodec = hardwareCodec.value();
+    }
+  }
+
   AVCodecContext* avCodecContext = avcodec_alloc_context3(avCodec);
   TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
   avCodecContext_.reset(avCodecContext);
@@ -668,6 +683,11 @@ void VideoEncoder::initializeEncoder(
         std::to_string(videoStreamOptions.crf.value()).c_str(),
         0);
   }
+
+  // Register the hardware device context with the codec
+  // context before calling avcodec_open2().
+  deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
+
   int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
   av_dict_free(&options);
 
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -1,6 +1,7 @@
 #pragma once
 #include <torch/types.h>
 #include "src/torchcodec/_core/AVIOContextHolder.h"
+#include "src/torchcodec/_core/DeviceInterface.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/StreamOptions.h"
 
@@ -177,6 +178,7 @@ class VideoEncoder {
   AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;
 
   std::unique_ptr<AVIOContextHolder> avioContextHolder_;
+  std::unique_ptr<DeviceInterface> deviceInterface_;
 
   bool encodeWasCalled_ = false;
 };
diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp
@@ -40,7 +40,7 @@ AVPacket* ReferenceAVPacket::operator->() {
 
 AVCodecOnlyUseForCallingAVFindBestStream
 makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
-#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
+#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100) // FFmpeg < 5.0
   return const_cast<AVCodec*>(codec);
 #else
   return codec;
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -435,7 +435,7 @@ void SingleStreamDecoder::addStream(
   // addStream() which is supposed to be generic
   if (mediaType == AVMEDIA_TYPE_VIDEO) {
     avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(
-        deviceInterface_->findCodec(streamInfo.stream->codecpar->codec_id)
+        deviceInterface_->findDecoder(streamInfo.stream->codecpar->codec_id)
             .value_or(avCodec));
   }
 
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
-      "encode_video_to_file(Tensor frames, int frame_rate, str filename, int? crf=None) -> ()");
+      "encode_video_to_file(Tensor frames, int frame_rate, str filename, str device=\"cpu\", int? crf=None) -> ()");
   m.def(
-      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, int? crf=None) -> Tensor");
+      "encode_video_to_tensor(Tensor frames, int frame_rate, str format, str device=\"cpu\", int? crf=None) -> Tensor");
   m.def(
-      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, int? crf=None) -> ()");
+      "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str device=\"cpu\",int? crf=None) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def(
@@ -603,9 +603,13 @@ void encode_video_to_file(
     const at::Tensor& frames,
     int64_t frame_rate,
     std::string_view file_name,
+    std::string_view device = "cpu",
     std::optional<int64_t> crf = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.crf = crf;
+
+  videoStreamOptions.device = torch::Device(std::string(device));
+  videoStreamOptions.deviceVariant = "ffmpeg";
   VideoEncoder(
       frames,
       validateInt64ToInt(frame_rate, "frame_rate"),
@@ -618,10 +622,14 @@ at::Tensor encode_video_to_tensor(
     const at::Tensor& frames,
     int64_t frame_rate,
     std::string_view format,
+    std::string_view device = "cpu",
     std::optional<int64_t> crf = std::nullopt) {
   auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.crf = crf;
+
+  videoStreamOptions.device = torch::Device(std::string(device));
+  videoStreamOptions.deviceVariant = "ffmpeg";
   return VideoEncoder(
              frames,
              validateInt64ToInt(frame_rate, "frame_rate"),
@@ -636,6 +644,7 @@ void _encode_video_to_file_like(
     int64_t frame_rate,
     std::string_view format,
     int64_t file_like_context,
+    std::string_view device = "cpu",
     std::optional<int64_t> crf = std::nullopt) {
   auto fileLikeContext =
       reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
@@ -646,6 +655,9 @@ void _encode_video_to_file_like(
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.crf = crf;
 
+  videoStreamOptions.device = torch::Device(std::string(device));
+  videoStreamOptions.deviceVariant = "ffmpeg";
+
   VideoEncoder encoder(
       frames,
       validateInt64ToInt(frame_rate, "frame_rate"),
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -212,6 +212,7 @@ def encode_video_to_file_like(
     frame_rate: int,
     format: str,
     file_like: Union[io.RawIOBase, io.BufferedIOBase],
+    device: str = "cpu",
     crf: Optional[int] = None,
 ) -> None:
     """Encode video frames to a file-like object.
@@ -221,6 +222,7 @@ def encode_video_to_file_like(
         frame_rate: Frame rate in frames per second
         format: Video format (e.g., "mp4", "mov", "mkv")
         file_like: File-like object that supports write() and seek() methods
+        device: Device to use for encoding (default: "cpu")
         crf: Optional constant rate factor for encoding quality
     """
     assert _pybind_ops is not None
@@ -230,6 +232,7 @@ def encode_video_to_file_like(
         frame_rate,
         format,
         _pybind_ops.create_file_like_context(file_like, True),  # True means for writing
+        device,
         crf,
     )
 
@@ -318,7 +321,8 @@ def encode_video_to_file_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     filename: str,
-    crf: Optional[int],
+    device: str = "cpu",
+    crf: Optional[int] = None,
 ) -> None:
     return
 
@@ -328,7 +332,8 @@ def encode_video_to_tensor_abstract(
     frames: torch.Tensor,
     frame_rate: int,
     format: str,
-    crf: Optional[int],
+    device: str = "cpu",
+    crf: Optional[int] = None,
 ) -> torch.Tensor:
     return torch.empty([], dtype=torch.long)
 
@@ -339,6 +344,7 @@ def _encode_video_to_file_like_abstract(
     frame_rate: int,
     format: str,
     file_like_context: int,
+    device: str = "cpu",
     crf: Optional[int] = None,
 ) -> None:
     return
diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py
@@ -1,8 +1,8 @@
 from pathlib import Path
-from typing import Union
+from typing import Optional, Union
 
 import torch
-from torch import Tensor
+from torch import device as torch_device, Tensor
 
 from torchcodec import _core
 
@@ -18,9 +18,18 @@ class VideoEncoder:
             Values must be uint8 in the range ``[0, 255]``.
         frame_rate (int): The frame rate to use when encoding the
             **input** ``frames``.
+        device (str or torch.device, optional): The device to use for encoding. Default: "cpu".
+            If you pass a CUDA device, frames will be encoded on GPU.
+            Note: The "beta" CUDA backend is not supported for encoding.
     """
 
-    def __init__(self, frames: Tensor, *, frame_rate: int):
+    def __init__(
+        self,
+        frames: Tensor,
+        *,
+        frame_rate: int,
+        device: Optional[Union[str, torch_device]] = "cpu",
+    ):
         torch._C._log_api_usage_once("torchcodec.encoders.VideoEncoder")
         if not isinstance(frames, Tensor):
             raise ValueError(f"Expected frames to be a Tensor, got {type(frames) = }.")
@@ -34,8 +43,13 @@ def __init__(self, frames: Tensor, *, frame_rate: int):
         if frame_rate <= 0:
             raise ValueError(f"{frame_rate = } must be > 0.")
 
+        # Store the device as a string (a torch.device is converted via str())
+        if isinstance(device, torch_device):
+            device = str(device)
+
         self._frames = frames
         self._frame_rate = frame_rate
+        self._device = device
 
     def to_file(
         self,
@@ -52,6 +66,7 @@ def to_file(
             frames=self._frames,
             frame_rate=self._frame_rate,
             filename=str(dest),
+            device=self._device,
         )
 
     def to_tensor(
@@ -71,6 +86,7 @@ def to_tensor(
             frames=self._frames,
             frame_rate=self._frame_rate,
             format=format,
+            device=self._device,
         )
 
     def to_file_like(
@@ -94,4 +110,5 @@ def to_file_like(
             frame_rate=self._frame_rate,
             format=format,
             file_like=file_like,
+            device=self._device,
         )
diff --git a/test/test_encoders.py b/test/test_encoders.py
diff --git a/test/test_ops.py b/test/test_ops.py

Original file line number	Diff line number	Diff line change
`@@ -435,7 +435,7 @@ void SingleStreamDecoder::addStream(`
`435`	`435`	`// addStream() which is supposed to be generic`
`436`	`436`	`if (mediaType == AVMEDIA_TYPE_VIDEO) {`
`437`	`437`	`avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(`
`438`		`- deviceInterface_->findCodec(streamInfo.stream->codecpar->codec_id)`
	`438`	`+ deviceInterface_->findDecoder(streamInfo.stream->codecpar->codec_id)`
`439`	`439`	`.value_or(avCodec));`
`440`	`440`	`}`
`441`	`441`