Generalize DeviceInterface to include audio decoding (#1010)

mollyxu · Molly Xu · web-flow · commit 169484d24e63 · 2025-10-31T11:01:13.000-04:00
Co-authored-by: Molly Xu &lt;mollyxu@fb.com&gt;
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -46,8 +46,7 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
       FrameOutput& frameOutput,
-      std::optional<torch::Tensor> preAllocatedOutputTensor =
-          std::nullopt) override;
+      std::optional<torch::Tensor> preAllocatedOutputTensor) override;
 
   int sendPacket(ReferenceAVPacket& packet) override;
   int sendEOFPacket() override;
diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -35,6 +35,7 @@ void CpuDeviceInterface::initializeVideo(
     const VideoStreamOptions& videoStreamOptions,
     const std::vector<std::unique_ptr<Transform>>& transforms,
     const std::optional<FrameDims>& resizedOutputDims) {
+  avMediaType_ = AVMEDIA_TYPE_VIDEO;
   videoStreamOptions_ = videoStreamOptions;
   resizedOutputDims_ = resizedOutputDims;
 
@@ -86,6 +87,13 @@ void CpuDeviceInterface::initializeVideo(
   initialized_ = true;
 }
 
+void CpuDeviceInterface::initializeAudio(
+    const AudioStreamOptions& audioStreamOptions) {
+  avMediaType_ = AVMEDIA_TYPE_AUDIO;
+  audioStreamOptions_ = audioStreamOptions;
+  initialized_ = true;
+}
+
 ColorConversionLibrary CpuDeviceInterface::getColorConversionLibrary(
     const FrameDims& outputDims) const {
   // swscale requires widths to be multiples of 32:
@@ -114,6 +122,20 @@ ColorConversionLibrary CpuDeviceInterface::getColorConversionLibrary(
   }
 }
 
+void CpuDeviceInterface::convertAVFrameToFrameOutput(
+    UniqueAVFrame& avFrame,
+    FrameOutput& frameOutput,
+    std::optional<torch::Tensor> preAllocatedOutputTensor) {
+  TORCH_CHECK(initialized_, "CpuDeviceInterface was not initialized.");
+
+  if (avMediaType_ == AVMEDIA_TYPE_AUDIO) {
+    convertAudioAVFrameToFrameOutput(avFrame, frameOutput);
+  } else {
+    convertVideoAVFrameToFrameOutput(
+        avFrame, frameOutput, preAllocatedOutputTensor);
+  }
+}
+
 // Note [preAllocatedOutputTensor with swscale and filtergraph]:
 // Callers may pass a pre-allocated tensor, where the output.data tensor will
 // be stored. This parameter is honored in any case, but it only leads to a
@@ -123,12 +145,10 @@ ColorConversionLibrary CpuDeviceInterface::getColorConversionLibrary(
 // TODO: Figure out whether that's possible!
 // Dimension order of the preAllocatedOutputTensor must be HWC, regardless of
 // `dimension_order` parameter. It's up to callers to re-shape it if needed.
-void CpuDeviceInterface::convertAVFrameToFrameOutput(
+void CpuDeviceInterface::convertVideoAVFrameToFrameOutput(
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  TORCH_CHECK(initialized_, "CpuDeviceInterface was not initialized.");
-
   // Note that we ignore the dimensions from the metadata; we don't even bother
   // storing them. The resized dimensions take priority. If we don't have any,
   // then we use the dimensions from the actual decoded frame. We use the actual
@@ -278,6 +298,123 @@ torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
   return rgbAVFrameToTensor(filterGraph_->convert(avFrame));
 }
 
+void CpuDeviceInterface::convertAudioAVFrameToFrameOutput(
+    UniqueAVFrame& srcAVFrame,
+    FrameOutput& frameOutput) {
+  AVSampleFormat srcSampleFormat =
+      static_cast<AVSampleFormat>(srcAVFrame->format);
+  AVSampleFormat outSampleFormat = AV_SAMPLE_FMT_FLTP;
+
+  int srcSampleRate = srcAVFrame->sample_rate;
+  int outSampleRate = audioStreamOptions_.sampleRate.value_or(srcSampleRate);
+
+  int srcNumChannels = getNumChannels(codecContext_);
+  TORCH_CHECK(
+      srcNumChannels == getNumChannels(srcAVFrame),
+      "The frame has ",
+      getNumChannels(srcAVFrame),
+      " channels, expected ",
+      srcNumChannels,
+      ". If you are hitting this, it may be because you are using "
+      "a buggy FFmpeg version. FFmpeg4 is known to fail here in some "
+      "valid scenarios. Try to upgrade FFmpeg?");
+  int outNumChannels = audioStreamOptions_.numChannels.value_or(srcNumChannels);
+
+  bool mustConvert =
+      (srcSampleFormat != outSampleFormat || srcSampleRate != outSampleRate ||
+       srcNumChannels != outNumChannels);
+
+  UniqueAVFrame convertedAVFrame;
+  if (mustConvert) {
+    if (!swrContext_) {
+      swrContext_.reset(createSwrContext(
+          srcSampleFormat,
+          outSampleFormat,
+          srcSampleRate,
+          outSampleRate,
+          srcAVFrame,
+          outNumChannels));
+    }
+
+    convertedAVFrame = convertAudioAVFrameSamples(
+        swrContext_,
+        srcAVFrame,
+        outSampleFormat,
+        outSampleRate,
+        outNumChannels);
+  }
+  const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
+
+  AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
+  TORCH_CHECK(
+      format == outSampleFormat,
+      "Something went wrong, the frame didn't get converted to the desired format. ",
+      "Desired format = ",
+      av_get_sample_fmt_name(outSampleFormat),
+      "source format = ",
+      av_get_sample_fmt_name(format));
+
+  int numChannels = getNumChannels(avFrame);
+  TORCH_CHECK(
+      numChannels == outNumChannels,
+      "Something went wrong, the frame didn't get converted to the desired ",
+      "number of channels = ",
+      outNumChannels,
+      ". Got ",
+      numChannels,
+      " instead.");
+
+  auto numSamples = avFrame->nb_samples;
+
+  frameOutput.data = torch::empty({numChannels, numSamples}, torch::kFloat32);
+
+  if (numSamples > 0) {
+    uint8_t* outputChannelData =
+        static_cast<uint8_t*>(frameOutput.data.data_ptr());
+    auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
+    for (auto channel = 0; channel < numChannels;
+         ++channel, outputChannelData += numBytesPerChannel) {
+      std::memcpy(
+          outputChannelData,
+          avFrame->extended_data[channel],
+          numBytesPerChannel);
+    }
+  }
+}
+
+std::optional<torch::Tensor> CpuDeviceInterface::maybeFlushAudioBuffers() {
+  // When sample rate conversion is involved, swresample buffers some of the
+  // samples in-between calls to swr_convert (see the libswresample docs).
+  // That's because the last few samples in a given frame require future
+  // samples from the next frame to be properly converted. This function
+  // flushes out the samples that are stored in swresample's buffers.
+  if (!swrContext_) {
+    return std::nullopt;
+  }
+  auto numRemainingSamples = // this is an upper bound
+      swr_get_out_samples(swrContext_.get(), 0);
+
+  if (numRemainingSamples == 0) {
+    return std::nullopt;
+  }
+
+  int numChannels =
+      audioStreamOptions_.numChannels.value_or(getNumChannels(codecContext_));
+  torch::Tensor lastSamples =
+      torch::empty({numChannels, numRemainingSamples}, torch::kFloat32);
+
+  std::vector<uint8_t*> outputBuffers(numChannels);
+  for (auto i = 0; i < numChannels; i++) {
+    outputBuffers[i] = static_cast<uint8_t*>(lastSamples[i].data_ptr());
+  }
+
+  auto actualNumRemainingSamples = swr_convert(
+      swrContext_.get(), outputBuffers.data(), numRemainingSamples, nullptr, 0);
+
+  return lastSamples.narrow(
+      /*dim=*/1, /*start=*/0, /*length=*/actualNumRemainingSamples);
+}
+
 std::string CpuDeviceInterface::getDetails() {
   return std::string("CPU Device Interface.");
 }
diff --git a/src/torchcodec/_core/CpuDeviceInterface.h b/src/torchcodec/_core/CpuDeviceInterface.h
@@ -33,15 +33,28 @@ class CpuDeviceInterface : public DeviceInterface {
       const std::vector<std::unique_ptr<Transform>>& transforms,
       const std::optional<FrameDims>& resizedOutputDims) override;
 
+  virtual void initializeAudio(
+      const AudioStreamOptions& audioStreamOptions) override;
+
+  virtual std::optional<torch::Tensor> maybeFlushAudioBuffers() override;
+
   void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
       FrameOutput& frameOutput,
-      std::optional<torch::Tensor> preAllocatedOutputTensor =
-          std::nullopt) override;
+      std::optional<torch::Tensor> preAllocatedOutputTensor) override;
 
   std::string getDetails() override;
 
  private:
+  void convertAudioAVFrameToFrameOutput(
+      UniqueAVFrame& srcAVFrame,
+      FrameOutput& frameOutput);
+
+  void convertVideoAVFrameToFrameOutput(
+      UniqueAVFrame& avFrame,
+      FrameOutput& frameOutput,
+      std::optional<torch::Tensor> preAllocatedOutputTensor);
+
   int convertAVFrameToTensorUsingSwScale(
       const UniqueAVFrame& avFrame,
       torch::Tensor& outputTensor,
@@ -108,6 +121,10 @@ class CpuDeviceInterface : public DeviceInterface {
   bool userRequestedSwScale_;
 
   bool initialized_ = false;
+
+  // Audio-specific members
+  AudioStreamOptions audioStreamOptions_;
+  UniqueSwrContext swrContext_;
 };
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -37,8 +37,7 @@ class CudaDeviceInterface : public DeviceInterface {
   void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
       FrameOutput& frameOutput,
-      std::optional<torch::Tensor> preAllocatedOutputTensor =
-          std::nullopt) override;
+      std::optional<torch::Tensor> preAllocatedOutputTensor) override;
 
   std::string getDetails() override;
 
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -65,6 +65,21 @@ class DeviceInterface {
           transforms,
       [[maybe_unused]] const std::optional<FrameDims>& resizedOutputDims) {}
 
+  // Initialize the device with parameters specific to audio decoding. There is
+  // a default empty implementation.
+  virtual void initializeAudio(
+      [[maybe_unused]] const AudioStreamOptions& audioStreamOptions) {}
+
+  // Flush any remaining samples from the audio resampler buffer.
+  // When sample rate conversion is involved, some samples may be buffered
+  // between frames for proper interpolation. This function flushes those
+  // buffered samples.
+  // Returns an optional tensor containing the flushed samples, or std::nullopt
+  // if there are no buffered samples or audio is not supported.
+  virtual std::optional<torch::Tensor> maybeFlushAudioBuffers() {
+    return std::nullopt;
+  }
+
   // In order for decoding to actually happen on an FFmpeg managed hardware
   // device, we need to register the DeviceInterface managed
   // AVHardwareDeviceContext with the AVCodecContext. We don't need to do this
@@ -126,6 +141,7 @@ class DeviceInterface {
  protected:
   torch::Device device_;
   SharedAVCodecContext codecContext_;
+  AVMediaType avMediaType_;
 };
 
 using CreateDeviceInterfaceFn =
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h