diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp index 45f6ba1a5..86c7d5e27 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp @@ -833,6 +833,16 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput( gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); } +UniqueAVFrame BetaCudaDeviceInterface::convertTensorToAVFrame( + [[maybe_unused]] const torch::Tensor& tensor, + [[maybe_unused]] AVPixelFormat targetFormat, + [[maybe_unused]] int frameIndex, + [[maybe_unused]] AVCodecContext* codecContext) { + TORCH_CHECK( + false, + "Beta CUDA device interface does not support video encoding currently."); +} + std::string BetaCudaDeviceInterface::getDetails() { std::string details = "Beta CUDA Device Interface."; if (cpuFallback_) { diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h index cefb1a983..fba998a50 100644 --- a/src/torchcodec/_core/BetaCudaDeviceInterface.h +++ b/src/torchcodec/_core/BetaCudaDeviceInterface.h @@ -48,6 +48,12 @@ class BetaCudaDeviceInterface : public DeviceInterface { FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) override; + UniqueAVFrame convertTensorToAVFrame( + const torch::Tensor& tensor, + AVPixelFormat targetFormat, + int frameIndex, + AVCodecContext* codecContext) override; + int sendPacket(ReferenceAVPacket& packet) override; int sendEOFPacket() override; int receiveFrame(UniqueAVFrame& avFrame) override; diff --git a/src/torchcodec/_core/CUDACommon.cpp b/src/torchcodec/_core/CUDACommon.cpp index bbd17db39..d31acc968 100644 --- a/src/torchcodec/_core/CUDACommon.cpp +++ b/src/torchcodec/_core/CUDACommon.cpp @@ -156,6 +156,21 @@ const Npp32f bt709FullRangeColorTwist[3][4] = { {1.0f, -0.187324273f, -0.468124273f, -128.0f}, {1.0f, 1.8556f, 0.0f, -128.0f}}; +// RGB to NV12 color conversion matrices (inverse of YUV to RGB) 
+// Note: NPP's ColorTwist function apparently expects "limited range" +// coefficient format even when producing full range output. All matrices below +// use the limited range coefficient format (Y with +16 offset) for NPP +// compatibility. + +// BT.601 limited range (matches FFmpeg default behavior) +const Npp32f defaultLimitedRangeRgbToNv12[3][4] = { + // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B) + {0.257f, 0.504f, 0.098f, 16.0f}, + // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients) + {-0.148f, -0.291f, 0.439f, 128.0f}, + // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients) + {0.439f, -0.368f, -0.071f, 128.0f}}; + torch::Tensor convertNV12FrameToRGB( UniqueAVFrame& avFrame, const torch::Device& device, @@ -246,6 +261,68 @@ torch::Tensor convertNV12FrameToRGB( return dst; } +void convertRGBTensorToNV12Frame( + const torch::Tensor& rgbTensor, + UniqueAVFrame& nv12Frame, + const torch::Device& device, + const UniqueNppContext& nppCtx, + at::cuda::CUDAStream inputStream) { + TORCH_CHECK(rgbTensor.is_cuda(), "RGB tensor must be on CUDA device"); + TORCH_CHECK( + rgbTensor.dim() == 3 && rgbTensor.size(0) == 3, + "Expected 3D RGB tensor in CHW format, got shape: ", + rgbTensor.sizes()); + TORCH_CHECK( + nv12Frame != nullptr && nv12Frame->data[0] != nullptr, + "nv12Frame must be pre-allocated with CUDA memory"); + + // Convert CHW to HWC for NPP processing + int height = static_cast(rgbTensor.size(1)); + int width = static_cast(rgbTensor.size(2)); + torch::Tensor hwcFrame = rgbTensor.permute({1, 2, 0}).contiguous(); + + // Set up stream synchronization - make NPP stream wait for input tensor + // operations + at::cuda::CUDAStream nppStream = + at::cuda::getCurrentCUDAStream(device.index()); + at::cuda::CUDAEvent inputDoneEvent; + inputDoneEvent.record(inputStream); + inputDoneEvent.block(nppStream); + + // Setup NPP context + nppCtx->hStream = nppStream.stream(); + cudaError_t cudaErr = + cudaStreamGetFlags(nppCtx->hStream, 
&nppCtx->nStreamFlags); + TORCH_CHECK( + cudaErr == cudaSuccess, + "cudaStreamGetFlags failed: ", + cudaGetErrorString(cudaErr)); + + // Always use FFmpeg's default behavior: BT.601 limited range + NppiSize oSizeROI = {width, height}; + + NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx( + static_cast(hwcFrame.data_ptr()), + hwcFrame.stride(0) * hwcFrame.element_size(), + nv12Frame->data, + nv12Frame->linesize, + oSizeROI, + defaultLimitedRangeRgbToNv12, + *nppCtx); + + TORCH_CHECK( + status == NPP_SUCCESS, + "Failed to convert RGB to NV12: NPP error code ", + status); + + // Validate CUDA operations completed successfully + cudaError_t memCheck = cudaGetLastError(); + TORCH_CHECK( + memCheck == cudaSuccess, + "CUDA error detected: ", + cudaGetErrorString(memCheck)); +} + UniqueNppContext getNppStreamContext(const torch::Device& device) { int deviceIndex = getDeviceIndex(device); diff --git a/src/torchcodec/_core/CUDACommon.h b/src/torchcodec/_core/CUDACommon.h index 4cc27c23b..15502540f 100644 --- a/src/torchcodec/_core/CUDACommon.h +++ b/src/torchcodec/_core/CUDACommon.h @@ -37,6 +37,13 @@ torch::Tensor convertNV12FrameToRGB( at::cuda::CUDAStream nvdecStream, std::optional preAllocatedOutputTensor = std::nullopt); +void convertRGBTensorToNV12Frame( + const torch::Tensor& rgbTensor, + UniqueAVFrame& nv12Frame, + const torch::Device& device, + const UniqueNppContext& nppCtx, + at::cuda::CUDAStream inputStream); + UniqueNppContext getNppStreamContext(const torch::Device& device); void returnNppStreamContextToCache( const torch::Device& device, diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp index 70f46b7e4..d7e58cb45 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.cpp +++ b/src/torchcodec/_core/CpuDeviceInterface.cpp @@ -429,6 +429,84 @@ std::optional CpuDeviceInterface::maybeFlushAudioBuffers() { /*dim=*/1, /*start=*/0, /*length=*/actualNumRemainingSamples); } +UniqueAVFrame 
CpuDeviceInterface::convertTensorToAVFrame( + const torch::Tensor& frame, + AVPixelFormat outPixelFormat, + int frameIndex, + [[maybe_unused]] AVCodecContext* codecContext) { + int inHeight = static_cast(frame.sizes()[1]); + int inWidth = static_cast(frame.sizes()[2]); + + // For now, reuse input dimensions as output dimensions + int outWidth = inWidth; + int outHeight = inHeight; + + // Input format is RGB planar (AV_PIX_FMT_GBRP after channel reordering) + AVPixelFormat inPixelFormat = AV_PIX_FMT_GBRP; + + // Initialize and cache scaling context if it does not exist + if (!swsContext_) { + swsContext_.reset(sws_getContext( + inWidth, + inHeight, + inPixelFormat, + outWidth, + outHeight, + outPixelFormat, + SWS_BICUBIC, // Used by FFmpeg CLI + nullptr, + nullptr, + nullptr)); + TORCH_CHECK(swsContext_ != nullptr, "Failed to create scaling context"); + } + + UniqueAVFrame avFrame(av_frame_alloc()); + TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame"); + + // Set output frame properties + avFrame->format = outPixelFormat; + avFrame->width = outWidth; + avFrame->height = outHeight; + avFrame->pts = frameIndex; + + int status = av_frame_get_buffer(avFrame.get(), 0); + TORCH_CHECK(status >= 0, "Failed to allocate frame buffer"); + + // Need to convert/scale the frame + // Create temporary frame with input format + UniqueAVFrame inputFrame(av_frame_alloc()); + TORCH_CHECK(inputFrame != nullptr, "Failed to allocate input AVFrame"); + + inputFrame->format = inPixelFormat; + inputFrame->width = inWidth; + inputFrame->height = inHeight; + + uint8_t* tensorData = static_cast(frame.data_ptr()); + + // TODO-VideoEncoder: Reorder tensor if in NHWC format + int channelSize = inHeight * inWidth; + // Reorder RGB -> GBR for AV_PIX_FMT_GBRP format + // TODO-VideoEncoder: Determine if FFmpeg supports planar RGB input format + inputFrame->data[0] = tensorData + channelSize; + inputFrame->data[1] = tensorData + (2 * channelSize); + inputFrame->data[2] = tensorData; + + 
inputFrame->linesize[0] = inWidth; + inputFrame->linesize[1] = inWidth; + inputFrame->linesize[2] = inWidth; + + status = sws_scale( + swsContext_.get(), + inputFrame->data, + inputFrame->linesize, + 0, + inputFrame->height, + avFrame->data, + avFrame->linesize); + TORCH_CHECK(status == outHeight, "sws_scale failed"); + return avFrame; +} + std::string CpuDeviceInterface::getDetails() { return std::string("CPU Device Interface."); } diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h index 801b83826..c33d5d051 100644 --- a/src/torchcodec/_core/CpuDeviceInterface.h +++ b/src/torchcodec/_core/CpuDeviceInterface.h @@ -18,11 +18,6 @@ class CpuDeviceInterface : public DeviceInterface { virtual ~CpuDeviceInterface() {} - std::optional findCodec( - [[maybe_unused]] const AVCodecID& codecId) override { - return std::nullopt; - } - virtual void initialize( const AVStream* avStream, const UniqueDecodingAVFormatContext& avFormatCtx, @@ -43,6 +38,12 @@ class CpuDeviceInterface : public DeviceInterface { FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) override; + UniqueAVFrame convertTensorToAVFrame( + const torch::Tensor& tensor, + AVPixelFormat targetFormat, + int frameIndex, + AVCodecContext* codecContext) override; + std::string getDetails() override; private: diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp index 0e20c5e8d..84cefc142 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.cpp +++ b/src/torchcodec/_core/CudaDeviceInterface.cpp @@ -1,8 +1,10 @@ #include #include +#include #include #include +#include "CUDACommon.h" #include "Cache.h" #include "CudaDeviceInterface.h" #include "FFMPEGCommon.h" @@ -144,6 +146,40 @@ void CudaDeviceInterface::registerHardwareDeviceWithCodec( codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get()); } +void CudaDeviceInterface::setupEncodingContext(AVCodecContext* codecContext) { + TORCH_CHECK( + 
hardwareDeviceCtx_, "Hardware device context has not been initialized"); + TORCH_CHECK(codecContext != nullptr, "codecContext is null"); + // is there any way to preserve actual desired format? + // codecContext->sw_pix_fmt = codecContext->pix_fmt; + // Should we always produce AV_PIX_FMT_NV12? + codecContext->sw_pix_fmt = AV_PIX_FMT_NV12; + codecContext->pix_fmt = AV_PIX_FMT_CUDA; + + AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get()); + TORCH_CHECK( + hwFramesCtxRef != nullptr, + "Failed to allocate hardware frames context for codec"); + + AVHWFramesContext* hwFramesCtx = + reinterpret_cast(hwFramesCtxRef->data); + hwFramesCtx->format = codecContext->pix_fmt; + hwFramesCtx->sw_format = codecContext->sw_pix_fmt; + hwFramesCtx->width = codecContext->width; + hwFramesCtx->height = codecContext->height; + + int ret = av_hwframe_ctx_init(hwFramesCtxRef); + if (ret < 0) { + av_buffer_unref(&hwFramesCtxRef); + TORCH_CHECK( + false, + "Failed to initialize CUDA frames context for codec: ", + getFFMPEGErrorStringFromErrorCode(ret)); + } + + codecContext->hw_frames_ctx = hwFramesCtxRef; +} + UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12OrRGB24( UniqueAVFrame& avFrame) { // We need FFmpeg filters to handle those conversion cases which are not @@ -329,11 +365,40 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput( avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor); } +namespace { +// Helper function to check if a codec supports CUDA hardware acceleration +bool codecSupportsCudaHardware(const AVCodec* codec) { + const AVCodecHWConfig* config = nullptr; + for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr; ++j) { + if (config->device_type == AV_HWDEVICE_TYPE_CUDA) { + return true; + } + } + return false; +} +} // namespace + // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9 // we have to do this because of an FFmpeg bug where hardware decoding is not // 
appropriately set, so we just go off and find the matching codec for the CUDA // device -std::optional CudaDeviceInterface::findCodec( + +std::optional CudaDeviceInterface::findEncoder( + const AVCodecID& codecId) { + void* i = nullptr; + const AVCodec* codec = nullptr; + while ((codec = av_codec_iterate(&i)) != nullptr) { + if (codec->id != codecId || !av_codec_is_encoder(codec)) { + continue; + } + if (codecSupportsCudaHardware(codec)) { + return codec; + } + } + return std::nullopt; +} + +std::optional CudaDeviceInterface::findDecoder( const AVCodecID& codecId) { void* i = nullptr; const AVCodec* codec = nullptr; @@ -342,18 +407,52 @@ std::optional CudaDeviceInterface::findCodec( continue; } - const AVCodecHWConfig* config = nullptr; - for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr; - ++j) { - if (config->device_type == AV_HWDEVICE_TYPE_CUDA) { - return codec; - } + if (codecSupportsCudaHardware(codec)) { + return codec; } } return std::nullopt; } +UniqueAVFrame CudaDeviceInterface::convertTensorToAVFrame( + const torch::Tensor& frame, + [[maybe_unused]] AVPixelFormat targetFormat, + int frameIndex, + AVCodecContext* codecContext) { + TORCH_CHECK(frame.is_cuda(), "CUDA device interface requires CUDA tensors"); + TORCH_CHECK( + frame.dim() == 3 && frame.size(0) == 3, + "Expected 3D RGB tensor (CHW format), got shape: ", + frame.sizes()); + + UniqueAVFrame avFrame(av_frame_alloc()); + TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame"); + + avFrame->format = AV_PIX_FMT_CUDA; + avFrame->width = static_cast(frame.size(2)); + avFrame->height = static_cast(frame.size(1)); + avFrame->pts = frameIndex; + + int ret = av_hwframe_get_buffer( + codecContext ? 
codecContext->hw_frames_ctx : nullptr, avFrame.get(), 0); + TORCH_CHECK( + ret >= 0, + "Failed to allocate hardware frame: ", + getFFMPEGErrorStringFromErrorCode(ret)); + + at::cuda::CUDAStream currentStream = + at::cuda::getCurrentCUDAStream(device_.index()); + + convertRGBTensorToNV12Frame(frame, avFrame, device_, nppCtx_, currentStream); + + // Set color properties to FFmpeg defaults + avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601 + avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range + + return avFrame; +} + std::string CudaDeviceInterface::getDetails() { // Note: for this interface specifically the fallback is only known after a // frame has been decoded, not before: that's when FFmpeg decides to fallback, diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h index c892bd49b..83761020c 100644 --- a/src/torchcodec/_core/CudaDeviceInterface.h +++ b/src/torchcodec/_core/CudaDeviceInterface.h @@ -18,7 +18,8 @@ class CudaDeviceInterface : public DeviceInterface { virtual ~CudaDeviceInterface(); - std::optional findCodec(const AVCodecID& codecId) override; + std::optional findEncoder(const AVCodecID& codecId) override; + std::optional findDecoder(const AVCodecID& codecId) override; void initialize( const AVStream* avStream, @@ -34,11 +35,19 @@ class CudaDeviceInterface : public DeviceInterface { void registerHardwareDeviceWithCodec(AVCodecContext* codecContext) override; + void setupEncodingContext(AVCodecContext* codecContext) override; + void convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor) override; + UniqueAVFrame convertTensorToAVFrame( + const torch::Tensor& tensor, + AVPixelFormat targetFormat, + int frameIndex, + AVCodecContext* codecContext) override; + std::string getDetails() override; private: diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h index 319fe01a8..2b69dbfc9 100644 --- 
a/src/torchcodec/_core/DeviceInterface.h +++ b/src/torchcodec/_core/DeviceInterface.h @@ -46,7 +46,12 @@ class DeviceInterface { return device_; }; - virtual std::optional findCodec( + virtual std::optional findEncoder( + [[maybe_unused]] const AVCodecID& codecId) { + return std::nullopt; + }; + + virtual std::optional findDecoder( [[maybe_unused]] const AVCodecID& codecId) { return std::nullopt; }; @@ -87,11 +92,25 @@ class DeviceInterface { virtual void registerHardwareDeviceWithCodec( [[maybe_unused]] AVCodecContext* codecContext) {} + // Setup device-specific encoding context (e.g., hardware frame contexts). + // Called after registerHardwareDeviceWithCodec for encoders. + // Default implementation does nothing (suitable for CPU and basic cases). + virtual void setupEncodingContext( + [[maybe_unused]] AVCodecContext* codecContext) {} + virtual void convertAVFrameToFrameOutput( UniqueAVFrame& avFrame, FrameOutput& frameOutput, std::optional preAllocatedOutputTensor = std::nullopt) = 0; + // Convert tensor to AVFrame, implemented per device interface. 
+ // This is similar to convertAVFrameToFrameOutput for encoding + virtual UniqueAVFrame convertTensorToAVFrame( + const torch::Tensor& tensor, + AVPixelFormat targetFormat, + int frameIndex, + AVCodecContext* codecContext) = 0; + // ------------------------------------------ // Extension points for custom decoding paths // ------------------------------------------ diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp index 19ac9220d..4e6fde742 100644 --- a/src/torchcodec/_core/Encoder.cpp +++ b/src/torchcodec/_core/Encoder.cpp @@ -5,6 +5,7 @@ #include "torch/types.h" extern "C" { +#include #include #include } @@ -523,7 +524,9 @@ void AudioEncoder::flushBuffers() { namespace { -torch::Tensor validateFrames(const torch::Tensor& frames) { +torch::Tensor validateFrames( + const torch::Tensor& frames, + const torch::Device& device) { TORCH_CHECK( frames.dtype() == torch::kUInt8, "frames must have uint8 dtype, got ", @@ -536,6 +539,15 @@ torch::Tensor validateFrames(const torch::Tensor& frames) { frames.sizes()[1] == 3, "frame must have 3 channels (R, G, B), got ", frames.sizes()[1]); + if (device.type() != torch::kCPU) { + TORCH_CHECK( + frames.is_cuda(), + "When using CUDA encoding (device=", + device.str(), + "), frames must be on a CUDA device. Got frames on ", + frames.device().str(), + ". 
Please move frames to a CUDA device: frames.to('cuda')"); + } return frames.contiguous(); } @@ -665,7 +677,8 @@ VideoEncoder::VideoEncoder( double frameRate, std::string_view fileName, const VideoStreamOptions& videoStreamOptions) - : frames_(validateFrames(frames)), inFrameRate_(frameRate) { + : frames_(validateFrames(frames, videoStreamOptions.device)), + inFrameRate_(frameRate) { setFFmpegLogLevel(); // Allocate output format context @@ -698,7 +711,7 @@ VideoEncoder::VideoEncoder( std::string_view formatName, std::unique_ptr avioContextHolder, const VideoStreamOptions& videoStreamOptions) - : frames_(validateFrames(frames)), + : frames_(validateFrames(frames, videoStreamOptions.device)), inFrameRate_(frameRate), avioContextHolder_(std::move(avioContextHolder)) { setFFmpegLogLevel(); @@ -724,6 +737,12 @@ VideoEncoder::VideoEncoder( void VideoEncoder::initializeEncoder( const VideoStreamOptions& videoStreamOptions) { + deviceInterface_ = createDeviceInterface( + videoStreamOptions.device, videoStreamOptions.deviceVariant); + TORCH_CHECK( + deviceInterface_ != nullptr, + "Failed to create device interface. 
This should never happen, please report."); + const AVCodec* avCodec = nullptr; // If codec arg is provided, find codec using logic similar to FFmpeg: // https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835 @@ -748,7 +767,13 @@ void VideoEncoder::initializeEncoder( TORCH_CHECK( avFormatContext_->oformat != nullptr, "Output format is null, unable to find default codec."); + // Try to find a hardware-accelerated encoder if not using CPU avCodec = avcodec_find_encoder(avFormatContext_->oformat->video_codec); + if (videoStreamOptions.device.type() != torch::kCPU) { + avCodec = + deviceInterface_->findEncoder(avFormatContext_->oformat->video_codec) + .value_or(avCodec); + } TORCH_CHECK(avCodec != nullptr, "Video codec not found"); } @@ -820,6 +845,14 @@ void VideoEncoder::initializeEncoder( videoStreamOptions.preset.value().c_str(), 0); } + + // Register the hardware device context with the codec + // context before calling avcodec_open2(). + deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get()); + + // Setup device-specific encoding context (e.g., hardware frame contexts) + deviceInterface_->setupEncodingContext(avCodecContext_.get()); + int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions); av_dict_free(&avCodecOptions); @@ -860,7 +893,8 @@ void VideoEncoder::encode() { int numFrames = static_cast(frames_.sizes()[0]); for (int i = 0; i < numFrames; ++i) { torch::Tensor currFrame = frames_[i]; - UniqueAVFrame avFrame = convertTensorToAVFrame(currFrame, i); + UniqueAVFrame avFrame = deviceInterface_->convertTensorToAVFrame( + currFrame, outPixelFormat_, i, avCodecContext_.get()); encodeFrame(autoAVPacket, avFrame); } @@ -873,72 +907,6 @@ void VideoEncoder::encode() { getFFMPEGErrorStringFromErrorCode(status)); } -UniqueAVFrame VideoEncoder::convertTensorToAVFrame( - const torch::Tensor& frame, - int frameIndex) { - // Initialize and cache scaling context if it does not exist - if 
(!swsContext_) { - swsContext_.reset(sws_getContext( - inWidth_, - inHeight_, - inPixelFormat_, - outWidth_, - outHeight_, - outPixelFormat_, - SWS_BICUBIC, // Used by FFmpeg CLI - nullptr, - nullptr, - nullptr)); - TORCH_CHECK(swsContext_ != nullptr, "Failed to create scaling context"); - } - - UniqueAVFrame avFrame(av_frame_alloc()); - TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame"); - - // Set output frame properties - avFrame->format = outPixelFormat_; - avFrame->width = outWidth_; - avFrame->height = outHeight_; - avFrame->pts = frameIndex; - - int status = av_frame_get_buffer(avFrame.get(), 0); - TORCH_CHECK(status >= 0, "Failed to allocate frame buffer"); - - // Need to convert/scale the frame - // Create temporary frame with input format - UniqueAVFrame inputFrame(av_frame_alloc()); - TORCH_CHECK(inputFrame != nullptr, "Failed to allocate input AVFrame"); - - inputFrame->format = inPixelFormat_; - inputFrame->width = inWidth_; - inputFrame->height = inHeight_; - - uint8_t* tensorData = static_cast(frame.data_ptr()); - - // TODO-VideoEncoder: Reorder tensor if in NHWC format - int channelSize = inHeight_ * inWidth_; - // Reorder RGB -> GBR for AV_PIX_FMT_GBRP format - // TODO-VideoEncoder: Determine if FFmpeg supports planar RGB input format - inputFrame->data[0] = tensorData + channelSize; - inputFrame->data[1] = tensorData + (2 * channelSize); - inputFrame->data[2] = tensorData; - - inputFrame->linesize[0] = inWidth_; - inputFrame->linesize[1] = inWidth_; - inputFrame->linesize[2] = inWidth_; - - status = sws_scale( - swsContext_.get(), - inputFrame->data, - inputFrame->linesize, - 0, - inputFrame->height, - avFrame->data, - avFrame->linesize); - TORCH_CHECK(status == outHeight_, "sws_scale failed"); - return avFrame; -} - torch::Tensor VideoEncoder::encodeToTensor() { TORCH_CHECK( avioContextHolder_ != nullptr, diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h index 1bdc1e443..c32e44943 100644 --- 
a/src/torchcodec/_core/Encoder.h +++ b/src/torchcodec/_core/Encoder.h @@ -3,6 +3,7 @@ #include #include #include "AVIOContextHolder.h" +#include "DeviceInterface.h" #include "FFMPEGCommon.h" #include "StreamOptions.h" @@ -160,9 +161,6 @@ class VideoEncoder { private: void initializeEncoder(const VideoStreamOptions& videoStreamOptions); - UniqueAVFrame convertTensorToAVFrame( - const torch::Tensor& frame, - int frameIndex); void encodeFrame(AutoAVPacket& autoAVPacket, const UniqueAVFrame& avFrame); void flushBuffers(); @@ -183,6 +181,7 @@ class VideoEncoder { AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE; std::unique_ptr avioContextHolder_; + std::unique_ptr deviceInterface_; bool encodeWasCalled_ = false; AVDictionary* avFormatOptions_ = nullptr; diff --git a/src/torchcodec/_core/FFMPEGCommon.cpp b/src/torchcodec/_core/FFMPEGCommon.cpp index e1b88b36a..14ddd3e6e 100644 --- a/src/torchcodec/_core/FFMPEGCommon.cpp +++ b/src/torchcodec/_core/FFMPEGCommon.cpp @@ -40,7 +40,7 @@ AVPacket* ReferenceAVPacket::operator->() { AVCodecOnlyUseForCallingAVFindBestStream makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) { -#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100) +#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100) // FFmpeg < 5.0.3 return const_cast(codec); #else return codec; diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index ac7489bbe..dd1d9cbb3 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -462,7 +462,7 @@ void SingleStreamDecoder::addStream( // addStream() which is supposed to be generic if (mediaType == AVMEDIA_TYPE_VIDEO) { avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream( - deviceInterface_->findCodec(streamInfo.stream->codecpar->codec_id) + deviceInterface_->findDecoder(streamInfo.stream->codecpar->codec_id) .value_or(avCodec)); } diff --git a/src/torchcodec/_core/custom_ops.cpp 
b/src/torchcodec/_core/custom_ops.cpp index 4ec72974d..750b52ae8 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) { m.def( "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()"); m.def( - "encode_video_to_file(Tensor frames, float frame_rate, str filename, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()"); + "encode_video_to_file(Tensor frames, float frame_rate, str filename, str device=\"cpu\", str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()"); m.def( - "encode_video_to_tensor(Tensor frames, float frame_rate, str format, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> Tensor"); + "encode_video_to_tensor(Tensor frames, float frame_rate, str format, str device=\"cpu\", str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> Tensor"); m.def( - "_encode_video_to_file_like(Tensor frames, float frame_rate, str format, int file_like_context, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()"); + "_encode_video_to_file_like(Tensor frames, float frame_rate, str format, int file_like_context, str device=\"cpu\", str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()"); m.def( "create_from_tensor(Tensor video_tensor, str? 
seek_mode=None) -> Tensor"); m.def( @@ -415,7 +415,6 @@ void _add_video_stream( } validateDeviceInterface(std::string(device), std::string(device_variant)); - videoStreamOptions.device = torch::Device(std::string(device)); videoStreamOptions.deviceVariant = device_variant; @@ -641,12 +640,14 @@ void encode_video_to_file( const at::Tensor& frames, double frame_rate, std::string_view file_name, + std::string_view device = "cpu", std::optional codec = std::nullopt, std::optional pixel_format = std::nullopt, std::optional crf = std::nullopt, std::optional preset = std::nullopt, std::optional> extra_options = std::nullopt) { VideoStreamOptions videoStreamOptions; + videoStreamOptions.device = torch::Device(std::string(device)); videoStreamOptions.codec = std::move(codec); videoStreamOptions.pixelFormat = std::move(pixel_format); videoStreamOptions.crf = crf; @@ -664,6 +665,7 @@ at::Tensor encode_video_to_tensor( const at::Tensor& frames, double frame_rate, std::string_view format, + std::string_view device = "cpu", std::optional codec = std::nullopt, std::optional pixel_format = std::nullopt, std::optional crf = std::nullopt, @@ -671,6 +673,7 @@ at::Tensor encode_video_to_tensor( std::optional> extra_options = std::nullopt) { auto avioContextHolder = std::make_unique(); VideoStreamOptions videoStreamOptions; + videoStreamOptions.device = torch::Device(std::string(device)); videoStreamOptions.codec = std::move(codec); videoStreamOptions.pixelFormat = std::move(pixel_format); videoStreamOptions.crf = crf; @@ -695,6 +698,7 @@ void _encode_video_to_file_like( double frame_rate, std::string_view format, int64_t file_like_context, + std::string_view device = "cpu", std::optional codec = std::nullopt, std::optional pixel_format = std::nullopt, std::optional crf = std::nullopt, @@ -707,6 +711,7 @@ void _encode_video_to_file_like( std::unique_ptr avioContextHolder(fileLikeContext); VideoStreamOptions videoStreamOptions; + videoStreamOptions.device = 
torch::Device(std::string(device)); videoStreamOptions.codec = std::move(codec); videoStreamOptions.pixelFormat = std::move(pixel_format); videoStreamOptions.crf = crf; @@ -1019,6 +1024,9 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) { m.impl("_create_from_file_like", &_create_from_file_like); m.impl( "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions); + m.impl("encode_video_to_file", &encode_video_to_file); + m.impl("encode_video_to_tensor", &encode_video_to_tensor); + m.impl("_encode_video_to_file_like", &_encode_video_to_file_like); } TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) { diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py index 160e273bb..c25031cc2 100644 --- a/src/torchcodec/_core/ops.py +++ b/src/torchcodec/_core/ops.py @@ -213,6 +213,7 @@ def encode_video_to_file_like( frame_rate: float, format: str, file_like: Union[io.RawIOBase, io.BufferedIOBase], + device: Optional[str] = "cpu", codec: Optional[str] = None, pixel_format: Optional[str] = None, crf: Optional[Union[int, float]] = None, @@ -226,6 +227,7 @@ def encode_video_to_file_like( frame_rate: Frame rate in frames per second format: Video format (e.g., "mp4", "mov", "mkv") file_like: File-like object that supports write() and seek() methods + device: Device to use for encoding (default: "cpu") codec: Optional codec name (e.g., "libx264", "h264") pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p") crf: Optional constant rate factor for encoding quality @@ -239,6 +241,7 @@ def encode_video_to_file_like( frame_rate, format, _pybind_ops.create_file_like_context(file_like, True), # True means for writing + device, codec, pixel_format, crf, @@ -331,6 +334,7 @@ def encode_video_to_file_abstract( frames: torch.Tensor, frame_rate: float, filename: str, + device: str = "cpu", codec: Optional[str] = None, pixel_format: Optional[str] = None, preset: Optional[str] = None, @@ -345,6 +349,7 @@ def encode_video_to_tensor_abstract( frames: 
torch.Tensor, frame_rate: float, format: str, + device: str = "cpu", codec: Optional[str] = None, pixel_format: Optional[str] = None, preset: Optional[str] = None, @@ -360,6 +365,7 @@ def _encode_video_to_file_like_abstract( frame_rate: float, format: str, file_like_context: int, + device: str = "cpu", codec: Optional[str] = None, pixel_format: Optional[str] = None, preset: Optional[str] = None, diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py index 49ece70b6..8a1a7bc6d 100644 --- a/src/torchcodec/encoders/_video_encoder.py +++ b/src/torchcodec/encoders/_video_encoder.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Optional, Union import torch -from torch import Tensor +from torch import device as torch_device, Tensor from torchcodec import _core @@ -16,9 +16,18 @@ class VideoEncoder: C is 3 channels (RGB), H is height, and W is width. Values must be uint8 in the range ``[0, 255]``. frame_rate (float): The frame rate of the **input** ``frames``. Also defines the encoded **output** frame rate. + device (str or torch.device, optional): The device to use for encoding. Default: "cpu". + If you pass a CUDA device, frames will be encoded on GPU. + Note: The "beta" CUDA backend is not supported for encoding. 
""" - def __init__(self, frames: Tensor, *, frame_rate: float): + def __init__( + self, + frames: Tensor, + *, + frame_rate: float, + device: Optional[Union[str, torch_device]] = "cpu", + ): torch._C._log_api_usage_once("torchcodec.encoders.VideoEncoder") if not isinstance(frames, Tensor): raise ValueError(f"Expected frames to be a Tensor, got {type(frames) = }.") @@ -29,8 +38,13 @@ def __init__(self, frames: Tensor, *, frame_rate: float): if frame_rate <= 0: raise ValueError(f"{frame_rate = } must be > 0.") + # Validate and store device + if isinstance(device, torch_device): + device = str(device) + self._frames = frames self._frame_rate = frame_rate + self._device = device def to_file( self, @@ -69,6 +83,7 @@ def to_file( frames=self._frames, frame_rate=self._frame_rate, filename=str(dest), + device=self._device, codec=codec, pixel_format=pixel_format, crf=crf, @@ -117,6 +132,7 @@ def to_tensor( frames=self._frames, frame_rate=self._frame_rate, format=format, + device=self._device, codec=codec, pixel_format=pixel_format, crf=crf, @@ -169,6 +185,7 @@ def to_file_like( frame_rate=self._frame_rate, format=format, file_like=file_like, + device=self._device, codec=codec, pixel_format=pixel_format, crf=crf, diff --git a/test/test_encoders.py b/test/test_encoders.py index 543025599..eb3d8193f 100644 --- a/test/test_encoders.py +++ b/test/test_encoders.py @@ -10,7 +10,6 @@ import pytest import torch from torchcodec.decoders import AudioDecoder, VideoDecoder - from torchcodec.encoders import AudioEncoder, VideoEncoder from .utils import ( @@ -796,15 +795,22 @@ def test_extra_options_errors(self, method, tmp_path, extra_options, error): getattr(encoder, method)(**valid_params, extra_options=extra_options) @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like")) - def test_contiguity(self, method, tmp_path): + @pytest.mark.parametrize( + "device", ("cpu", pytest.param("cuda", marks=pytest.mark.needs_cuda)) + ) + def test_contiguity(self, method, 
tmp_path, device): # Ensure that 2 sets of video frames with the same pixel values are encoded # in the same way, regardless of their memory layout. Here we encode 2 equal # frame tensors, one is contiguous while the other is non-contiguous. - num_frames, channels, height, width = 5, 3, 64, 64 - contiguous_frames = torch.randint( - 0, 256, size=(num_frames, channels, height, width), dtype=torch.uint8 - ).contiguous() + num_frames, channels, height, width = 5, 3, 256, 256 + contiguous_frames = ( + torch.randint( + 0, 256, size=(num_frames, channels, height, width), dtype=torch.uint8 + ) + .contiguous() + .to(device) + ) assert contiguous_frames.is_contiguous() # Permute NCHW to NHWC, then update the memory layout, then permute back @@ -823,16 +829,18 @@ def encode_to_tensor(frames): common_params = dict(crf=0, pixel_format="yuv444p") if method == "to_file": dest = str(tmp_path / "output.mp4") - VideoEncoder(frames, frame_rate=30).to_file(dest=dest, **common_params) + VideoEncoder(frames, frame_rate=30, device=device).to_file( + dest=dest, **common_params + ) with open(dest, "rb") as f: - return torch.frombuffer(f.read(), dtype=torch.uint8) + return torch.frombuffer(f.read(), dtype=torch.uint8).clone() elif method == "to_tensor": - return VideoEncoder(frames, frame_rate=30).to_tensor( + return VideoEncoder(frames, frame_rate=30, device=device).to_tensor( format="mp4", **common_params ) elif method == "to_file_like": file_like = io.BytesIO() - VideoEncoder(frames, frame_rate=30).to_file_like( + VideoEncoder(frames, frame_rate=30, device=device).to_file_like( file_like, format="mp4", **common_params ) return torch.frombuffer(file_like.getvalue(), dtype=torch.uint8) @@ -1259,3 +1267,100 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range assert metadata["profile"].lower() == expected_profile assert metadata["color_space"] == colorspace assert metadata["color_range"] == color_range + + @pytest.mark.needs_cuda + 
@pytest.mark.skipif(in_fbcode(), reason="ffmpeg CLI not available") + @pytest.mark.parametrize("preset", ("slow", "fast")) + @pytest.mark.parametrize("pixel_format", ("nv12", "yuv420p")) + @pytest.mark.parametrize("format", ("mov", "mp4", "avi", "mkv", "flv")) + @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like")) + def test_nvenc_against_ffmpeg_cli( + self, tmp_path, preset, pixel_format, format, method + ): + # Encode with FFmpeg CLI using h264_nvenc + device = "cuda" + source_frames = self.decode(TEST_SRC_2_720P.path).data.to(device) + + temp_raw_path = str(tmp_path / "temp_input.raw") + with open(temp_raw_path, "wb") as f: + f.write(source_frames.permute(0, 2, 3, 1).cpu().numpy().tobytes()) + + ffmpeg_encoded_path = str(tmp_path / f"ffmpeg_nvenc_output.{format}") + frame_rate = 30 + + ffmpeg_cmd = [ + "ffmpeg", + "-y", + "-f", + "rawvideo", + "-pix_fmt", + "rgb24", # Input format + "-s", + f"{source_frames.shape[3]}x{source_frames.shape[2]}", + "-r", + str(frame_rate), + "-i", + temp_raw_path, + "-c:v", + "h264_nvenc", # Use NVENC hardware encoder + ] + + ffmpeg_cmd.extend(["-pix_fmt", pixel_format]) # Output format + ffmpeg_cmd.extend(["-preset", preset]) # Use parametrized preset + ffmpeg_cmd.extend(["-qp", "0"]) # Use lossless qp for consistency + ffmpeg_cmd.extend([ffmpeg_encoded_path]) + + # Skip (rather than fail) when the machine running CI has no NVENC-capable device 
+ try: + subprocess.run(ffmpeg_cmd, check=True, capture_output=True) + except subprocess.CalledProcessError as e: + if b"No NVENC capable devices found" in e.stderr: + pytest.skip("NVENC not available on this system") + else: + raise + + encoder = VideoEncoder( + frames=source_frames, frame_rate=frame_rate, device=device + ) + + encoder_extra_options = {"qp": 0} + if method == "to_file": + encoder_output_path = str(tmp_path / f"nvenc_output.{format}") + encoder.to_file( + dest=encoder_output_path, + codec="h264_nvenc", + pixel_format=pixel_format, + preset=preset, + extra_options=encoder_extra_options, + ) + encoder_output = encoder_output_path + elif method == "to_tensor": + encoder_output = encoder.to_tensor( + format=format, + codec="h264_nvenc", + pixel_format=pixel_format, + preset=preset, + extra_options=encoder_extra_options, + ) + elif method == "to_file_like": + file_like = io.BytesIO() + encoder.to_file_like( + file_like=file_like, + format=format, + codec="h264_nvenc", + pixel_format=pixel_format, + preset=preset, + extra_options=encoder_extra_options, + ) + encoder_output = file_like.getvalue() + else: + raise ValueError(f"Unknown method: {method}") + + ffmpeg_frames = self.decode(ffmpeg_encoded_path).data + encoder_frames = self.decode(encoder_output).data + + assert ffmpeg_frames.shape[0] == encoder_frames.shape[0] + for ff_frame, enc_frame in zip(ffmpeg_frames, encoder_frames): + assert psnr(ff_frame, enc_frame) > 25 + assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=99, atol=10) + assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=95, atol=2)