diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 45f6ba1a5..86c7d5e27 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -833,6 +833,16 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); } +UniqueAVFrame BetaCudaDeviceInterface::convertTensorToAVFrame( + [[maybe_unused]] const torch::Tensor& tensor, + [[maybe_unused]] AVPixelFormat targetFormat, + [[maybe_unused]] int frameIndex, + [[maybe_unused]] AVCodecContext* codecContext) { + TORCH_CHECK( + false, + "Beta CUDA device interface does not support video encoding currently."); +} + std::string BetaCudaDeviceInterface::getDetails() { std::string details = "Beta CUDA Device Interface."; if (cpuFallback_) { diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index cefb1a983..fba998a50 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -48,6 +48,12 @@ class BetaCudaDeviceInterface : public DeviceInterface { FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) override; + UniqueAVFrame convertTensorToAVFrame( + const torch::Tensor& tensor, + AVPixelFormat targetFormat, + int frameIndex, + AVCodecContext* codecContext) override; + int sendPacket(ReferenceAVPacket& packet) override; int sendEOFPacket() override; int receiveFrame(UniqueAVFrame& avFrame) override; diff --git a/src/torchcodec/_core/CUDACommon.cpp b/src/torchcodec/_core/CUDACommon.cpp index bbd17db39..d31acc968 100644 --- a/src/torchcodec/_core/CUDACommon.cpp +++ b/src/torchcodec/_core/CUDACommon.cpp @@ -156,6 +156,21 @@ const Npp32f bt709FullRangeColorTwist[3][4] = { {1.0f, -0.187324273f, -0.468124273f, -128.0f}, {1.0f, 1.8556f, 0.0f, -128.0f}}; +// RGB to NV12 color conversion matrices (inverse of YUV to RGB) 
+// Note: NPP's ColorTwist function apparently expects "limited range" +// coefficient format even when producing full range output. All matrices below +// use the limited range coefficient format (Y with +16 offset) for NPP +// compatibility. + +// BT.601 limited range (matches FFmpeg default behavior) +const Npp32f defaultLimitedRangeRgbToNv12[3][4] = { + // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B) + {0.257f, 0.504f, 0.098f, 16.0f}, + // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients) + {-0.148f, -0.291f, 0.439f, 128.0f}, + // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients) + {0.439f, -0.368f, -0.071f, 128.0f}}; + torch::Tensor convertNV12FrameToRGB( UniqueAVFrame& avFrame, const torch::Device& device, @@ -246,6 +261,68 @@ torch::Tensor convertNV12FrameToRGB( return dst; } +void convertRGBTensorToNV12Frame( + const torch::Tensor& rgbTensor, + UniqueAVFrame& nv12Frame, + const torch::Device& device, + const UniqueNppContext& nppCtx, + at::cuda::CUDAStream inputStream) { + TORCH_CHECK(rgbTensor.is_cuda(), "RGB tensor must be on CUDA device"); + TORCH_CHECK( + rgbTensor.dim() == 3 && rgbTensor.size(0) == 3, + "Expected 3D RGB tensor in CHW format, got shape: ", + rgbTensor.sizes()); + TORCH_CHECK( + nv12Frame != nullptr && nv12Frame->data[0] != nullptr, + "nv12Frame must be pre-allocated with CUDA memory"); + + // Convert CHW to HWC for NPP processing + int height = static_cast(rgbTensor.size(1)); + int width = static_cast(rgbTensor.size(2)); + torch::Tensor hwcFrame = rgbTensor.permute({1, 2, 0}).contiguous(); + + // Set up stream synchronization - make NPP stream wait for input tensor + // operations + at::cuda::CUDAStream nppStream = + at::cuda::getCurrentCUDAStream(device.index()); + at::cuda::CUDAEvent inputDoneEvent; + inputDoneEvent.record(inputStream); + inputDoneEvent.block(nppStream); + + // Setup NPP context + nppCtx->hStream = nppStream.stream(); + cudaError_t cudaErr = + cudaStreamGetFlags(nppCtx->hStream, 
&nppCtx->nStreamFlags); + TORCH_CHECK( + cudaErr == cudaSuccess, + "cudaStreamGetFlags failed: ", + cudaGetErrorString(cudaErr)); + + // Always use FFmpeg's default behavior: BT.601 limited range + NppiSize oSizeROI = {width, height}; + + NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx( + static_cast(hwcFrame.data_ptr()), + hwcFrame.stride(0) * hwcFrame.element_size(), + nv12Frame->data, + nv12Frame->linesize, + oSizeROI, + defaultLimitedRangeRgbToNv12, + *nppCtx); + + TORCH_CHECK( + status == NPP_SUCCESS, + "Failed to convert RGB to NV12: NPP error code ", + status); + + // Validate CUDA operations completed successfully + cudaError_t memCheck = cudaGetLastError(); + TORCH_CHECK( + memCheck == cudaSuccess, + "CUDA error detected: ", + cudaGetErrorString(memCheck)); +} + UniqueNppContext getNppStreamContext(const torch::Device& device) { int deviceIndex = getDeviceIndex(device); diff --git a/src/torchcodec/_core/CUDACommon.h b/src/torchcodec/_core/CUDACommon.h index 4cc27c23b..15502540f 100644 --- a/src/torchcodec/_core/CUDACommon.h +++ b/src/torchcodec/_core/CUDACommon.h @@ -37,6 +37,13 @@ torch::Tensor convertNV12FrameToRGB( at::cuda::CUDAStream nvdecStream, std::optional preAllocatedOutputTensor = std::nullopt); +void convertRGBTensorToNV12Frame( + const torch::Tensor& rgbTensor, + UniqueAVFrame& nv12Frame, + const torch::Device& device, + const UniqueNppContext& nppCtx, + at::cuda::CUDAStream inputStream); + UniqueNppContext getNppStreamContext(const torch::Device& device); void returnNppStreamContextToCache( const torch::Device& device, diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp index 70f46b7e4..d7e58cb45 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.cpp +++ b/src/torchcodec/_core/CpuDeviceInterface.cpp @@ -429,6 +429,84 @@ std::optional CpuDeviceInterface::maybeFlushAudioBuffers() { /*dim=*/1, /*start=*/0, /*length=*/actualNumRemainingSamples); } +UniqueAVFrame 
CpuDeviceInterface::convertTensorToAVFrame( + const torch::Tensor& frame, + AVPixelFormat outPixelFormat, + int frameIndex, + [[maybe_unused]] AVCodecContext* codecContext) { + int inHeight = static_cast(frame.sizes()[1]); + int inWidth = static_cast(frame.sizes()[2]); + + // For now, reuse input dimensions as output dimensions + int outWidth = inWidth; + int outHeight = inHeight; + + // Input format is RGB planar (AV_PIX_FMT_GBRP after channel reordering) + AVPixelFormat inPixelFormat = AV_PIX_FMT_GBRP; + + // Initialize and cache scaling context if it does not exist + if (!swsContext_) { + swsContext_.reset(sws_getContext( + inWidth, + inHeight, + inPixelFormat, + outWidth, + outHeight, + outPixelFormat, + SWS_BICUBIC, // Used by FFmpeg CLI + nullptr, + nullptr, + nullptr)); + TORCH_CHECK(swsContext_ != nullptr, "Failed to create scaling context"); + } + + UniqueAVFrame avFrame(av_frame_alloc()); + TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame"); + + // Set output frame properties + avFrame->format = outPixelFormat; + avFrame->width = outWidth; + avFrame->height = outHeight; + avFrame->pts = frameIndex; + + int status = av_frame_get_buffer(avFrame.get(), 0); + TORCH_CHECK(status >= 0, "Failed to allocate frame buffer"); + + // Need to convert/scale the frame + // Create temporary frame with input format + UniqueAVFrame inputFrame(av_frame_alloc()); + TORCH_CHECK(inputFrame != nullptr, "Failed to allocate input AVFrame"); + + inputFrame->format = inPixelFormat; + inputFrame->width = inWidth; + inputFrame->height = inHeight; + + uint8_t* tensorData = static_cast(frame.data_ptr()); + + // TODO-VideoEncoder: Reorder tensor if in NHWC format + int channelSize = inHeight * inWidth; + // Reorder RGB -> GBR for AV_PIX_FMT_GBRP format + // TODO-VideoEncoder: Determine if FFmpeg supports planar RGB input format + inputFrame->data[0] = tensorData + channelSize; + inputFrame->data[1] = tensorData + (2 * channelSize); + inputFrame->data[2] = tensorData; + + 
inputFrame->linesize[0] = inWidth; + inputFrame->linesize[1] = inWidth; + inputFrame->linesize[2] = inWidth; + + status = sws_scale( + swsContext_.get(), + inputFrame->data, + inputFrame->linesize, + 0, + inputFrame->height, + avFrame->data, + avFrame->linesize); + TORCH_CHECK(status == outHeight, "sws_scale failed"); + return avFrame; +} + std::string CpuDeviceInterface::getDetails() { return std::string("CPU Device Interface."); } diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h index 801b83826..c33d5d051 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.h +++ b/src/torchcodec/_core/CpuDeviceInterface.h @@ -18,11 +18,6 @@ class CpuDeviceInterface : public DeviceInterface { virtual ~CpuDeviceInterface() {} - std::optional findCodec( - [[maybe_unused]] const AVCodecID& codecId) override { - return std::nullopt; - } - virtual void initialize( const AVStream* avStream, const UniqueDecodingAVFormatContext& avFormatCtx, @@ -43,6 +38,12 @@ class CpuDeviceInterface : public DeviceInterface { FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) override; + UniqueAVFrame convertTensorToAVFrame( + const torch::Tensor& tensor, + AVPixelFormat targetFormat, + int frameIndex, + AVCodecContext* codecContext) override; + std::string getDetails() override; private: diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 0e20c5e8d..84cefc142 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -1,8 +1,10 @@ #include #include +#include #include #include +#include "CUDACommon.h" #include "Cache.h" #include "CudaDeviceInterface.h" #include "FFMPEGCommon.h" @@ -144,6 +146,40 @@ void CudaDeviceInterface::registerHardwareDeviceWithCodec( codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get()); } +void CudaDeviceInterface::setupEncodingContext(AVCodecContext* codecContext) { + TORCH_CHECK( + 
hardwareDeviceCtx_, "Hardware device context has not been initialized"); + TORCH_CHECK(codecContext != nullptr, "codecContext is null"); + // is there any way to preserve actual desired format? + // codecContext->sw_pix_fmt = codecContext->pix_fmt; + // Should we always produce AV_PIX_FMT_NV12? + codecContext->sw_pix_fmt = AV_PIX_FMT_NV12; + codecContext->pix_fmt = AV_PIX_FMT_CUDA; + + AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get()); + TORCH_CHECK( + hwFramesCtxRef != nullptr, + "Failed to allocate hardware frames context for codec"); + + AVHWFramesContext* hwFramesCtx = + reinterpret_cast(hwFramesCtxRef->data); + hwFramesCtx->format = codecContext->pix_fmt; + hwFramesCtx->sw_format = codecContext->sw_pix_fmt; + hwFramesCtx->width = codecContext->width; + hwFramesCtx->height = codecContext->height; + + int ret = av_hwframe_ctx_init(hwFramesCtxRef); + if (ret < 0) { + av_buffer_unref(&hwFramesCtxRef); + TORCH_CHECK( + false, + "Failed to initialize CUDA frames context for codec: ", + getFFMPEGErrorStringFromErrorCode(ret)); + } + + codecContext->hw_frames_ctx = hwFramesCtxRef; +} + UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12OrRGB24( UniqueAVFrame& avFrame) { // We need FFmpeg filters to handle those conversion cases which are not @@ -329,11 +365,40 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); } +namespace { +// Helper function to check if a codec supports CUDA hardware acceleration +bool codecSupportsCudaHardware(const AVCodec* codec) { + const AVCodecHWConfig* config = nullptr; + for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr; ++j) { + if (config->device_type == AV_HWDEVICE_TYPE_CUDA) { + return true; + } + } + return false; +} +} // namespace + // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9 // we have to do this because of an FFmpeg bug where hardware decoding is not // 
appropriately set, so we just go off and find the matching codec for the CUDA // device -std::optional CudaDeviceInterface::findCodec( + +std::optional CudaDeviceInterface::findEncoder( + const AVCodecID& codecId) { + void* i = nullptr; + const AVCodec* codec = nullptr; + while ((codec = av_codec_iterate(&i)) != nullptr) { + if (codec->id != codecId || !av_codec_is_encoder(codec)) { + continue; + } + if (codecSupportsCudaHardware(codec)) { + return codec; + } + } + return std::nullopt; +} + +std::optional CudaDeviceInterface::findDecoder( const AVCodecID& codecId) { void* i = nullptr; const AVCodec* codec = nullptr; @@ -342,18 +407,52 @@ std::optional CudaDeviceInterface::findCodec( continue; } - const AVCodecHWConfig* config = nullptr; - for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr; - ++j) { - if (config->device_type == AV_HWDEVICE_TYPE_CUDA) { - return codec; - } + if (codecSupportsCudaHardware(codec)) { + return codec; } } return std::nullopt; } +UniqueAVFrame CudaDeviceInterface::convertTensorToAVFrame( + const torch::Tensor& frame, + [[maybe_unused]] AVPixelFormat targetFormat, + int frameIndex, + AVCodecContext* codecContext) { + TORCH_CHECK(frame.is_cuda(), "CUDA device interface requires CUDA tensors"); + TORCH_CHECK( + frame.dim() == 3 && frame.size(0) == 3, + "Expected 3D RGB tensor (CHW format), got shape: ", + frame.sizes()); + + UniqueAVFrame avFrame(av_frame_alloc()); + TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame"); + + avFrame->format = AV_PIX_FMT_CUDA; + avFrame->width = static_cast(frame.size(2)); + avFrame->height = static_cast(frame.size(1)); + avFrame->pts = frameIndex; + + int ret = av_hwframe_get_buffer( + codecContext ? 
codecContext->hw_frames_ctx : nullptr, avFrame.get(), 0); + TORCH_CHECK( + ret >= 0, + "Failed to allocate hardware frame: ", + getFFMPEGErrorStringFromErrorCode(ret)); + + at::cuda::CUDAStream currentStream = + at::cuda::getCurrentCUDAStream(device_.index()); + + convertRGBTensorToNV12Frame(frame, avFrame, device_, nppCtx_, currentStream); + + // Set color properties to FFmpeg defaults + avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601 + avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range + + return avFrame; +} + std::string CudaDeviceInterface::getDetails() { // Note: for this interface specifically the fallback is only known after a // frame has been decoded, not before: that's when FFmpeg decides to fallback, diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h index c892bd49b..83761020c 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.h +++ b/src/torchcodec/_core/CudaDeviceInterface.h @@ -18,7 +18,8 @@ class CudaDeviceInterface : public DeviceInterface { virtual ~CudaDeviceInterface(); - std::optional findCodec(const AVCodecID& codecId) override; + std::optional findEncoder(const AVCodecID& codecId) override; + std::optional findDecoder(const AVCodecID& codecId) override; void initialize( const AVStream* avStream, @@ -34,11 +35,19 @@ class CudaDeviceInterface : public DeviceInterface { void registerHardwareDeviceWithCodec(AVCodecContext* codecContext) override; + void setupEncodingContext(AVCodecContext* codecContext) override; + void convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) override; + UniqueAVFrame convertTensorToAVFrame( + const torch::Tensor& tensor, + AVPixelFormat targetFormat, + int frameIndex, + AVCodecContext* codecContext) override; + std::string getDetails() override; private: diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index 319fe01a8..2b69dbfc9 100644 --- 
a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -46,7 +46,12 @@ class DeviceInterface { return device_; }; - virtual std::optional findCodec( + virtual std::optional findEncoder( + [[maybe_unused]] const AVCodecID& codecId) { + return std::nullopt; + }; + + virtual std::optional findDecoder( [[maybe_unused]] const AVCodecID& codecId) { return std::nullopt; }; @@ -87,11 +92,25 @@ class DeviceInterface { virtual void registerHardwareDeviceWithCodec( [[maybe_unused]] AVCodecContext* codecContext) {} + // Setup device-specific encoding context (e.g., hardware frame contexts). + // Called after registerHardwareDeviceWithCodec for encoders. + // Default implementation does nothing (suitable for CPU and basic cases). + virtual void setupEncodingContext( + [[maybe_unused]] AVCodecContext* codecContext) {} + virtual void convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor = std::nullopt) = 0; + // Convert tensor to AVFrame, implemented per device interface. 
+ // This is similar to convertAVFrameToFrameOutput for encoding + virtual UniqueAVFrame convertTensorToAVFrame( + const torch::Tensor& tensor, + AVPixelFormat targetFormat, + int frameIndex, + AVCodecContext* codecContext) = 0; + // ------------------------------------------ // Extension points for custom decoding paths // ------------------------------------------ diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp index 19ac9220d..4e6fde742 100644 --- a/src/torchcodec/_core/Encoder.cpp +++ b/src/torchcodec/_core/Encoder.cpp @@ -5,6 +5,7 @@ #include "torch/types.h" extern "C" { +#include #include #include } @@ -523,7 +524,9 @@ void AudioEncoder::flushBuffers() { namespace { -torch::Tensor validateFrames(const torch::Tensor& frames) { +torch::Tensor validateFrames( + const torch::Tensor& frames, + const torch::Device& device) { TORCH_CHECK( frames.dtype() == torch::kUInt8, "frames must have uint8 dtype, got ", @@ -536,6 +539,15 @@ torch::Tensor validateFrames(const torch::Tensor& frames) { frames.sizes()[1] == 3, "frame must have 3 channels (R, G, B), got ", frames.sizes()[1]); + if (device.type() != torch::kCPU) { + TORCH_CHECK( + frames.is_cuda(), + "When using CUDA encoding (device=", + device.str(), + "), frames must be on a CUDA device. Got frames on ", + frames.device().str(), + ". 
Please move frames to a CUDA device: frames.to('cuda')"); + } return frames.contiguous(); } @@ -665,7 +677,8 @@ VideoEncoder::VideoEncoder( double frameRate, std::string_view fileName, const VideoStreamOptions& videoStreamOptions) - : frames_(validateFrames(frames)), inFrameRate_(frameRate) { + : frames_(validateFrames(frames, videoStreamOptions.device)), + inFrameRate_(frameRate) { setFFmpegLogLevel(); // Allocate output format context @@ -698,7 +711,7 @@ VideoEncoder::VideoEncoder( std::string_view formatName, std::unique_ptr avioContextHolder, const VideoStreamOptions& videoStreamOptions) - : frames_(validateFrames(frames)), + : frames_(validateFrames(frames, videoStreamOptions.device)), inFrameRate_(frameRate), avioContextHolder_(std::move(avioContextHolder)) { setFFmpegLogLevel(); @@ -724,6 +737,12 @@ VideoEncoder::VideoEncoder( void VideoEncoder::initializeEncoder( const VideoStreamOptions& videoStreamOptions) { + deviceInterface_ = createDeviceInterface( + videoStreamOptions.device, videoStreamOptions.deviceVariant); + TORCH_CHECK( + deviceInterface_ != nullptr, + "Failed to create device interface. 
This should never happen, please report."); + const AVCodec* avCodec = nullptr; // If codec arg is provided, find codec using logic similar to FFmpeg: // https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835 @@ -748,7 +767,13 @@ void VideoEncoder::initializeEncoder( TORCH_CHECK( avFormatContext_->oformat != nullptr, "Output format is null, unable to find default codec."); + // Try to find a hardware-accelerated encoder if not using CPU avCodec = avcodec_find_encoder(avFormatContext_->oformat->video_codec); + if (videoStreamOptions.device.type() != torch::kCPU) { + avCodec = + deviceInterface_->findEncoder(avFormatContext_->oformat->video_codec) + .value_or(avCodec); + } TORCH_CHECK(avCodec != nullptr, "Video codec not found"); } @@ -820,6 +845,14 @@ void VideoEncoder::initializeEncoder( videoStreamOptions.preset.value().c_str(), 0); } + + // Register the hardware device context with the codec + // context before calling avcodec_open2(). + deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get()); + + // Setup device-specific encoding context (e.g., hardware frame contexts) + deviceInterface_->setupEncodingContext(avCodecContext_.get()); + int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions); av_dict_free(&avCodecOptions); @@ -860,7 +893,8 @@ void VideoEncoder::encode() { int numFrames = static_cast(frames_.sizes()[0]); for (int i = 0; i < numFrames; ++i) { torch::Tensor currFrame = frames_[i]; - UniqueAVFrame avFrame = convertTensorToAVFrame(currFrame, i); + UniqueAVFrame avFrame = deviceInterface_->convertTensorToAVFrame( + currFrame, outPixelFormat_, i, avCodecContext_.get()); encodeFrame(autoAVPacket, avFrame); } @@ -873,72 +907,6 @@ void VideoEncoder::encode() { getFFMPEGErrorStringFromErrorCode(status)); } -UniqueAVFrame VideoEncoder::convertTensorToAVFrame( - const torch::Tensor& frame, - int frameIndex) { - // Initialize and cache scaling context if it does not exist - if 
(!swsContext_) { - swsContext_.reset(sws_getContext( - inWidth_, - inHeight_, - inPixelFormat_, - outWidth_, - outHeight_, - outPixelFormat_, - SWS_BICUBIC, // Used by FFmpeg CLI - nullptr, - nullptr, - nullptr)); - TORCH_CHECK(swsContext_ != nullptr, "Failed to create scaling context"); - } - - UniqueAVFrame avFrame(av_frame_alloc()); - TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame"); - - // Set output frame properties - avFrame->format = outPixelFormat_; - avFrame->width = outWidth_; - avFrame->height = outHeight_; - avFrame->pts = frameIndex; - - int status = av_frame_get_buffer(avFrame.get(), 0); - TORCH_CHECK(status >= 0, "Failed to allocate frame buffer"); - - // Need to convert/scale the frame - // Create temporary frame with input format - UniqueAVFrame inputFrame(av_frame_alloc()); - TORCH_CHECK(inputFrame != nullptr, "Failed to allocate input AVFrame"); - - inputFrame->format = inPixelFormat_; - inputFrame->width = inWidth_; - inputFrame->height = inHeight_; - - uint8_t* tensorData = static_cast(frame.data_ptr()); - - // TODO-VideoEncoder: Reorder tensor if in NHWC format - int channelSize = inHeight_ * inWidth_; - // Reorder RGB -> GBR for AV_PIX_FMT_GBRP format - // TODO-VideoEncoder: Determine if FFmpeg supports planar RGB input format - inputFrame->data[0] = tensorData + channelSize; - inputFrame->data[1] = tensorData + (2 * channelSize); - inputFrame->data[2] = tensorData; - - inputFrame->linesize[0] = inWidth_; - inputFrame->linesize[1] = inWidth_; - inputFrame->linesize[2] = inWidth_; - - status = sws_scale( - swsContext_.get(), - inputFrame->data, - inputFrame->linesize, - 0, - inputFrame->height, - avFrame->data, - avFrame->linesize); - TORCH_CHECK(status == outHeight_, "sws_scale failed"); - return avFrame; -} - torch::Tensor VideoEncoder::encodeToTensor() { TORCH_CHECK( avioContextHolder_ != nullptr, diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h index 1bdc1e443..c32e44943 100644 --- 
a/src/torchcodec/_core/Encoder.h +++ b/src/torchcodec/_core/Encoder.h @@ -3,6 +3,7 @@ #include #include #include "AVIOContextHolder.h" +#include "DeviceInterface.h" #include "FFMPEGCommon.h" #include "StreamOptions.h" @@ -160,9 +161,6 @@ class VideoEncoder { private: void initializeEncoder(const VideoStreamOptions& videoStreamOptions); - UniqueAVFrame convertTensorToAVFrame( - const torch::Tensor& frame, - int frameIndex); void encodeFrame(AutoAVPacket& autoAVPacket, const UniqueAVFrame& avFrame); void flushBuffers(); @@ -183,6 +181,7 @@ class VideoEncoder { AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE; std::unique_ptr avioContextHolder_; + std::unique_ptr deviceInterface_; bool encodeWasCalled_ = false; AVDictionary* avFormatOptions_ = nullptr; diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp index e1b88b36a..14ddd3e6e 100644 --- a/src/torchcodec/_core/FFMPEGCommon.cpp +++ b/src/torchcodec/_core/FFMPEGCommon.cpp @@ -40,7 +40,7 @@ AVPacket* ReferenceAVPacket::operator->() { AVCodecOnlyUseForCallingAVFindBestStream makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) { -#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100) +#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100) // FFmpeg < 5.0.3 return const_cast(codec); #else return codec; diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index ac7489bbe..dd1d9cbb3 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -462,7 +462,7 @@ void SingleStreamDecoder::addStream( // addStream() which is supposed to be generic if (mediaType == AVMEDIA_TYPE_VIDEO) { avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream( - deviceInterface_->findCodec(streamInfo.stream->codecpar->codec_id) + deviceInterface_->findDecoder(streamInfo.stream->codecpar->codec_id) .value_or(avCodec)); } diff --git a/src/torchcodec/_core/custom_ops.cpp 
b/src/torchcodec/_core/custom_ops.cpp index 4ec72974d..750b52ae8 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) { m.def( "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()"); m.def( - "encode_video_to_file(Tensor frames, float frame_rate, str filename, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()"); + "encode_video_to_file(Tensor frames, float frame_rate, str filename, str device=\"cpu\", str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()"); m.def( - "encode_video_to_tensor(Tensor frames, float frame_rate, str format, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> Tensor"); + "encode_video_to_tensor(Tensor frames, float frame_rate, str format, str device=\"cpu\", str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> Tensor"); m.def( - "_encode_video_to_file_like(Tensor frames, float frame_rate, str format, int file_like_context, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()"); + "_encode_video_to_file_like(Tensor frames, float frame_rate, str format, int file_like_context, str device=\"cpu\", str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()"); m.def( "create_from_tensor(Tensor video_tensor, str? 
seek_mode=None) -> Tensor"); m.def( @@ -415,7 +415,6 @@ void _add_video_stream( } validateDeviceInterface(std::string(device), std::string(device_variant)); - videoStreamOptions.device = torch::Device(std::string(device)); videoStreamOptions.deviceVariant = device_variant; @@ -641,12 +640,14 @@ void encode_video_to_file( const at::Tensor& frames, double frame_rate, std::string_view file_name, + std::string_view device = "cpu", std::optional codec = std::nullopt, std::optional pixel_format = std::nullopt, std::optional crf = std::nullopt, std::optional preset = std::nullopt, std::optional> extra_options = std::nullopt) { VideoStreamOptions videoStreamOptions; + videoStreamOptions.device = torch::Device(std::string(device)); videoStreamOptions.codec = std::move(codec); videoStreamOptions.pixelFormat = std::move(pixel_format); videoStreamOptions.crf = crf; @@ -664,6 +665,7 @@ at::Tensor encode_video_to_tensor( const at::Tensor& frames, double frame_rate, std::string_view format, + std::string_view device = "cpu", std::optional codec = std::nullopt, std::optional pixel_format = std::nullopt, std::optional crf = std::nullopt, @@ -671,6 +673,7 @@ at::Tensor encode_video_to_tensor( std::optional> extra_options = std::nullopt) { auto avioContextHolder = std::make_unique(); VideoStreamOptions videoStreamOptions; + videoStreamOptions.device = torch::Device(std::string(device)); videoStreamOptions.codec = std::move(codec); videoStreamOptions.pixelFormat = std::move(pixel_format); videoStreamOptions.crf = crf; @@ -695,6 +698,7 @@ void _encode_video_to_file_like( double frame_rate, std::string_view format, int64_t file_like_context, + std::string_view device = "cpu", std::optional codec = std::nullopt, std::optional pixel_format = std::nullopt, std::optional crf = std::nullopt, @@ -707,6 +711,7 @@ void _encode_video_to_file_like( std::unique_ptr avioContextHolder(fileLikeContext); VideoStreamOptions videoStreamOptions; + videoStreamOptions.device = 
torch::Device(std::string(device)); videoStreamOptions.codec = std::move(codec); videoStreamOptions.pixelFormat = std::move(pixel_format); videoStreamOptions.crf = crf; @@ -1019,6 +1024,9 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) { m.impl("_create_from_file_like", &_create_from_file_like); m.impl( "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions); + m.impl("encode_video_to_file", &encode_video_to_file); + m.impl("encode_video_to_tensor", &encode_video_to_tensor); + m.impl("_encode_video_to_file_like", &_encode_video_to_file_like); } TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) { diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py index 160e273bb..c25031cc2 100644 --- a/src/torchcodec/_core/ops.py +++ b/src/torchcodec/_core/ops.py @@ -213,6 +213,7 @@ def encode_video_to_file_like( frame_rate: float, format: str, file_like: Union[io.RawIOBase, io.BufferedIOBase], + device: Optional[str] = "cpu", codec: Optional[str] = None, pixel_format: Optional[str] = None, crf: Optional[Union[int, float]] = None, @@ -226,6 +227,7 @@ def encode_video_to_file_like( frame_rate: Frame rate in frames per second format: Video format (e.g., "mp4", "mov", "mkv") file_like: File-like object that supports write() and seek() methods + device: Device to use for encoding (default: "cpu") codec: Optional codec name (e.g., "libx264", "h264") pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p") crf: Optional constant rate factor for encoding quality @@ -239,6 +241,7 @@ def encode_video_to_file_like( frame_rate, format, _pybind_ops.create_file_like_context(file_like, True), # True means for writing + device, codec, pixel_format, crf, @@ -331,6 +334,7 @@ def encode_video_to_file_abstract( frames: torch.Tensor, frame_rate: float, filename: str, + device: str = "cpu", codec: Optional[str] = None, pixel_format: Optional[str] = None, preset: Optional[str] = None, @@ -345,6 +349,7 @@ def encode_video_to_tensor_abstract( frames: 
torch.Tensor, frame_rate: float, format: str, + device: str = "cpu", codec: Optional[str] = None, pixel_format: Optional[str] = None, preset: Optional[str] = None, @@ -360,6 +365,7 @@ def _encode_video_to_file_like_abstract( frame_rate: float, format: str, file_like_context: int, + device: str = "cpu", codec: Optional[str] = None, pixel_format: Optional[str] = None, preset: Optional[str] = None, diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py index 49ece70b6..8a1a7bc6d 100644 --- a/src/torchcodec/encoders/_video_encoder.py +++ b/src/torchcodec/encoders/_video_encoder.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Optional, Union import torch -from torch import Tensor +from torch import device as torch_device, Tensor from torchcodec import _core @@ -16,9 +16,18 @@ class VideoEncoder: C is 3 channels (RGB), H is height, and W is width. Values must be uint8 in the range ``[0, 255]``. frame_rate (float): The frame rate of the **input** ``frames``. Also defines the encoded **output** frame rate. + device (str or torch.device, optional): The device to use for encoding. Default: "cpu". + If you pass a CUDA device, frames will be encoded on GPU. + Note: The "beta" CUDA backend is not supported for encoding. 
""" - def __init__(self, frames: Tensor, *, frame_rate: float): + def __init__( + self, + frames: Tensor, + *, + frame_rate: float, + device: Optional[Union[str, torch_device]] = "cpu", + ): torch._C._log_api_usage_once("torchcodec.encoders.VideoEncoder") if not isinstance(frames, Tensor): raise ValueError(f"Expected frames to be a Tensor, got {type(frames) = }.") @@ -29,8 +38,13 @@ def __init__(self, frames: Tensor, *, frame_rate: float): if frame_rate <= 0: raise ValueError(f"{frame_rate = } must be > 0.") + # Validate and store device + if isinstance(device, torch_device): + device = str(device) + self._frames = frames self._frame_rate = frame_rate + self._device = device def to_file( self, @@ -69,6 +83,7 @@ def to_file( frames=self._frames, frame_rate=self._frame_rate, filename=str(dest), + device=self._device, codec=codec, pixel_format=pixel_format, crf=crf, @@ -117,6 +132,7 @@ def to_tensor( frames=self._frames, frame_rate=self._frame_rate, format=format, + device=self._device, codec=codec, pixel_format=pixel_format, crf=crf, @@ -169,6 +185,7 @@ def to_file_like( frame_rate=self._frame_rate, format=format, file_like=file_like, + device=self._device, codec=codec, pixel_format=pixel_format, crf=crf, diff --git a/test/test_encoders.py b/test/test_encoders.py index 543025599..eb3d8193f 100644 --- a/test/test_encoders.py +++ b/test/test_encoders.py @@ -10,7 +10,6 @@ import pytest import torch from torchcodec.decoders import AudioDecoder, VideoDecoder - from torchcodec.encoders import AudioEncoder, VideoEncoder from .utils import ( @@ -796,15 +795,22 @@ def test_extra_options_errors(self, method, tmp_path, extra_options, error): getattr(encoder, method)(**valid_params, extra_options=extra_options) @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like")) - def test_contiguity(self, method, tmp_path): + @pytest.mark.parametrize( + "device", ("cpu", pytest.param("cuda", marks=pytest.mark.needs_cuda)) + ) + def test_contiguity(self, method, 
tmp_path, device): # Ensure that 2 sets of video frames with the same pixel values are encoded # in the same way, regardless of their memory layout. Here we encode 2 equal # frame tensors, one is contiguous while the other is non-contiguous. - num_frames, channels, height, width = 5, 3, 64, 64 - contiguous_frames = torch.randint( - 0, 256, size=(num_frames, channels, height, width), dtype=torch.uint8 - ).contiguous() + num_frames, channels, height, width = 5, 3, 256, 256 + contiguous_frames = ( + torch.randint( + 0, 256, size=(num_frames, channels, height, width), dtype=torch.uint8 + ) + .contiguous() + .to(device) + ) assert contiguous_frames.is_contiguous() # Permute NCHW to NHWC, then update the memory layout, then permute back @@ -823,16 +829,18 @@ def encode_to_tensor(frames): common_params = dict(crf=0, pixel_format="yuv444p") if method == "to_file": dest = str(tmp_path / "output.mp4") - VideoEncoder(frames, frame_rate=30).to_file(dest=dest, **common_params) + VideoEncoder(frames, frame_rate=30, device=device).to_file( + dest=dest, **common_params + ) with open(dest, "rb") as f: - return torch.frombuffer(f.read(), dtype=torch.uint8) + return torch.frombuffer(f.read(), dtype=torch.uint8).clone() elif method == "to_tensor": - return VideoEncoder(frames, frame_rate=30).to_tensor( + return VideoEncoder(frames, frame_rate=30, device=device).to_tensor( format="mp4", **common_params ) elif method == "to_file_like": file_like = io.BytesIO() - VideoEncoder(frames, frame_rate=30).to_file_like( + VideoEncoder(frames, frame_rate=30, device=device).to_file_like( file_like, format="mp4", **common_params ) return torch.frombuffer(file_like.getvalue(), dtype=torch.uint8) @@ -1259,3 +1267,100 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range assert metadata["profile"].lower() == expected_profile assert metadata["color_space"] == colorspace assert metadata["color_range"] == color_range + + @pytest.mark.needs_cuda + 
@pytest.mark.skipif(in_fbcode(), reason="ffmpeg CLI not available") + @pytest.mark.parametrize("preset", ("slow", "fast")) + @pytest.mark.parametrize("pixel_format", ("nv12", "yuv420p")) + @pytest.mark.parametrize("format", ("mov", "mp4", "avi", "mkv", "flv")) + @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like")) + def test_nvenc_against_ffmpeg_cli( + self, tmp_path, preset, pixel_format, format, method + ): + # Encode with FFmpeg CLI using h264_nvenc + device = "cuda" + source_frames = self.decode(TEST_SRC_2_720P.path).data.to(device) + + temp_raw_path = str(tmp_path / "temp_input.raw") + with open(temp_raw_path, "wb") as f: + f.write(source_frames.permute(0, 2, 3, 1).cpu().numpy().tobytes()) + + ffmpeg_encoded_path = str(tmp_path / f"ffmpeg_nvenc_output.{format}") + frame_rate = 30 + + ffmpeg_cmd = [ + "ffmpeg", + "-y", + "-f", + "rawvideo", + "-pix_fmt", + "rgb24", # Input format + "-s", + f"{source_frames.shape[3]}x{source_frames.shape[2]}", + "-r", + str(frame_rate), + "-i", + temp_raw_path, + "-c:v", + "h264_nvenc", # Use NVENC hardware encoder + ] + + ffmpeg_cmd.extend(["-pix_fmt", pixel_format]) # Output format + ffmpeg_cmd.extend(["-preset", preset]) # Use parametrized preset + ffmpeg_cmd.extend(["-qp", "0"]) # Use lossless qp for consistency + ffmpeg_cmd.extend([ffmpeg_encoded_path]) + + # Skip (rather than fail) when the machine running CI has no NVENC-capable device 
+ try: + subprocess.run(ffmpeg_cmd, check=True, capture_output=True) + except subprocess.CalledProcessError as e: + if b"No NVENC capable devices found" in e.stderr: + pytest.skip("NVENC not available on this system") + else: + raise + + encoder = VideoEncoder( + frames=source_frames, frame_rate=frame_rate, device=device + ) + + encoder_extra_options = {"qp": 0} + if method == "to_file": + encoder_output_path = str(tmp_path / f"nvenc_output.{format}") + encoder.to_file( + dest=encoder_output_path, + codec="h264_nvenc", + pixel_format=pixel_format, + preset=preset, + extra_options=encoder_extra_options, + ) + encoder_output = encoder_output_path + elif method == "to_tensor": + encoder_output = encoder.to_tensor( + format=format, + codec="h264_nvenc", + pixel_format=pixel_format, + preset=preset, + extra_options=encoder_extra_options, + ) + elif method == "to_file_like": + file_like = io.BytesIO() + encoder.to_file_like( + file_like=file_like, + format=format, + codec="h264_nvenc", + pixel_format=pixel_format, + preset=preset, + extra_options=encoder_extra_options, + ) + encoder_output = file_like.getvalue() + else: + raise ValueError(f"Unknown method: {method}") + + ffmpeg_frames = self.decode(ffmpeg_encoded_path).data + encoder_frames = self.decode(encoder_output).data + + assert ffmpeg_frames.shape[0] == encoder_frames.shape[0] + for ff_frame, enc_frame in zip(ffmpeg_frames, encoder_frames): + assert psnr(ff_frame, enc_frame) > 25 + assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=99, atol=10) + assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=95, atol=2)