Skip to content

Commit b60d50f

Browse files
authored
BETA CUDA interface: Add support for VP9, VP8 and MPEG4 (#929)
1 parent 3fa5271 commit b60d50f

File tree

6 files changed

+124
-42
lines changed

6 files changed

+124
-42
lines changed

src/torchcodec/_core/BetaCudaDeviceInterface.cpp

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -150,11 +150,24 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
150150
return cudaVideoCodec_HEVC;
151151
case AV_CODEC_ID_AV1:
152152
return cudaVideoCodec_AV1;
153-
// TODONVDEC P0: support more codecs
154-
// case AV_CODEC_ID_MPEG4: return cudaVideoCodec_MPEG4;
155-
// case AV_CODEC_ID_VP8: return cudaVideoCodec_VP8;
156-
// case AV_CODEC_ID_VP9: return cudaVideoCodec_VP9;
157-
// case AV_CODEC_ID_MJPEG: return cudaVideoCodec_JPEG;
153+
case AV_CODEC_ID_VP9:
154+
return cudaVideoCodec_VP9;
155+
case AV_CODEC_ID_VP8:
156+
return cudaVideoCodec_VP8;
157+
case AV_CODEC_ID_MPEG4:
158+
return cudaVideoCodec_MPEG4;
159+
// Formats below are currently not tested, but they should "mostly" work.
160+
// MPEG1 was briefly locally tested and it was ok-ish despite duration being
161+
// off. Since they're far less popular, we keep them disabled by default but
162+
// we can consider enabling them upon user requests.
163+
// case AV_CODEC_ID_MPEG1VIDEO:
164+
// return cudaVideoCodec_MPEG1;
165+
// case AV_CODEC_ID_MPEG2VIDEO:
166+
// return cudaVideoCodec_MPEG2;
167+
// case AV_CODEC_ID_MJPEG:
168+
// return cudaVideoCodec_JPEG;
169+
// case AV_CODEC_ID_VC1:
170+
// return cudaVideoCodec_VC1;
158171
default: {
159172
TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
160173
}
@@ -270,10 +283,17 @@ void BetaCudaDeviceInterface::initializeBSF(
270283
}
271284
break;
272285
}
286+
case AV_CODEC_ID_MPEG4: {
287+
const std::string formatName =
288+
avFormatCtx->iformat->name ? avFormatCtx->iformat->name : "";
289+
if (formatName == "avi") {
290+
filterName = "mpeg4_unpack_bframes";
291+
}
292+
break;
293+
}
273294

274295
default:
275296
// No bitstream filter needed for other codecs
276-
// TODONVDEC P1 MPEG4 will need one!
277297
break;
278298
}
279299

@@ -512,19 +532,15 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
512532
avFrame->format = AV_PIX_FMT_CUDA;
513533
avFrame->pts = dispInfo.timestamp;
514534

515-
// TODONVDEC P2: We compute the duration based on average frame rate info:
516-
// either from NVCUVID if it's valid, otherwise from FFmpeg as fallback. But
517-
// both of these are based on average frame rate, so if the video has
518-
// variable frame rate, the durations may be off. We should try to see if we
519-
// can set the duration more accurately. Unfortunately it's not given by
520-
// dispInfo. One option would be to set it based on the pts difference between
521-
// consecutive frames, if the next frame is already available.
522-
int frameRateNum = static_cast<int>(videoFormat_.frame_rate.numerator);
523-
int frameRateDen = static_cast<int>(videoFormat_.frame_rate.denominator);
524-
AVRational frameRate = (frameRateNum > 0 && frameRateDen > 0)
525-
? AVRational{frameRateNum, frameRateDen}
526-
: frameRateAvgFromFFmpeg_;
527-
setDuration(avFrame, computeSafeDuration(frameRate, timeBase_));
535+
// TODONVDEC P2: We compute the duration based on average frame rate info, so
536+
// if the video has variable frame rate, the durations may be off. We
537+
// should try to see if we can set the duration more accurately. Unfortunately
538+
// it's not given by dispInfo. One option would be to set it based on the pts
539+
// difference between consecutive frames, if the next frame is already
540+
// available.
541+
// Note that we used to rely on videoFormat_.frame_rate for this, but that
542+
// proved less accurate than FFmpeg.
543+
setDuration(avFrame, computeSafeDuration(frameRateAvgFromFFmpeg_, timeBase_));
528544

529545
// We need to assign the frame colorspace. This is crucial for proper color
530546
// conversion. NVCUVID stores that in the matrix_coefficients field, but

test/resources/testsrc2_mpeg4.avi

777 KB
Binary file not shown.

test/resources/testsrc2_vp8.webm

141 KB
Binary file not shown.

test/resources/testsrc2_vp9.webm

102 KB
Binary file not shown.

test/test_decoders.py

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,12 @@
4343
SINE_MONO_S32,
4444
SINE_MONO_S32_44100,
4545
SINE_MONO_S32_8000,
46+
supports_approximate_mode,
4647
TEST_SRC_2_720P,
4748
TEST_SRC_2_720P_H265,
49+
TEST_SRC_2_720P_MPEG4,
50+
TEST_SRC_2_720P_VP8,
51+
TEST_SRC_2_720P_VP9,
4852
unsplit_device_str,
4953
)
5054

@@ -588,7 +592,7 @@ def test_get_frame_at_av1(self, device):
588592
return
589593

590594
if device == "cuda" and in_fbcode():
591-
pytest.skip("AV1 decoding on CUDA is not supported internally")
595+
pytest.skip("decoding on CUDA is not supported internally")
592596

593597
decoder = VideoDecoder(AV1_VIDEO.path, device=device)
594598
device, _ = unsplit_device_str(device)
@@ -1432,15 +1436,20 @@ def test_get_frames_at_tensor_indices(self):
14321436
decoder.get_frames_played_at(torch.tensor([0, 1], dtype=torch.int))
14331437
decoder.get_frames_played_at(torch.tensor([0, 1], dtype=torch.float))
14341438

1435-
# TODONVDEC P1 unskip equality assertion checks on FFMpeg4. The comparison
1436-
# checks are failing on very few pixels, e.g.:
1439+
# TODONVDEC P1:
1440+
# - unskip equality assertion checks on FFMpeg4. The comparison
1441+
# checks are failing on very few pixels, e.g.:
14371442
#
1438-
# E Mismatched elements: 648586 / 82944000 (0.8%)
1439-
# E Greatest absolute difference: 164 at index (20, 2, 27, 96)
1440-
# E Greatest relative difference: inf at index (5, 1, 112, 186)
1443+
# E Mismatched elements: 648586 / 82944000 (0.8%)
1444+
# E Greatest absolute difference: 164 at index (20, 2, 27, 96)
1445+
# E Greatest relative difference: inf at index (5, 1, 112, 186)
14411446
#
1442-
# So we're skipping them to unblock for now, but we should call
1443-
# assert_tensor_close_on_at_least or something like that.
1447+
# So we're skipping them to unblock for now, but we should call
1448+
# assert_tensor_close_on_at_least or something like that.
1449+
# - unskip equality assertion checks for MPEG4 asset. The frames are decoded
1450+
# fine, it's the color conversion that's different. The frame from the
1451+
# BETA interface is assumed to be 709 while the one from the default
1452+
# interface is 601.
14441453

14451454
@needs_cuda
14461455
@pytest.mark.parametrize(
@@ -1451,15 +1460,18 @@ def test_get_frames_at_tensor_indices(self):
14511460
BT709_FULL_RANGE,
14521461
TEST_SRC_2_720P_H265,
14531462
AV1_VIDEO,
1463+
TEST_SRC_2_720P_VP9,
1464+
TEST_SRC_2_720P_VP8,
1465+
TEST_SRC_2_720P_MPEG4,
14541466
),
14551467
)
14561468
@pytest.mark.parametrize("contiguous_indices", (True, False))
14571469
@pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
14581470
def test_beta_cuda_interface_get_frame_at(
14591471
self, asset, contiguous_indices, seek_mode
14601472
):
1461-
if asset == AV1_VIDEO and seek_mode == "approximate":
1462-
pytest.skip("AV1 asset doesn't work with approximate mode")
1473+
if seek_mode == "approximate" and not supports_approximate_mode(asset):
1474+
pytest.skip("asset doesn't work with approximate mode")
14631475

14641476
ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
14651477
beta_decoder = VideoDecoder(
@@ -1476,7 +1488,8 @@ def test_beta_cuda_interface_get_frame_at(
14761488
for frame_index in indices:
14771489
ref_frame = ref_decoder.get_frame_at(frame_index)
14781490
beta_frame = beta_decoder.get_frame_at(frame_index)
1479-
if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above
1491+
# TODONVDEC P1 see above
1492+
if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
14801493
torch.testing.assert_close(
14811494
beta_frame.data, ref_frame.data, rtol=0, atol=0
14821495
)
@@ -1493,15 +1506,18 @@ def test_beta_cuda_interface_get_frame_at(
14931506
BT709_FULL_RANGE,
14941507
TEST_SRC_2_720P_H265,
14951508
AV1_VIDEO,
1509+
TEST_SRC_2_720P_VP9,
1510+
TEST_SRC_2_720P_VP8,
1511+
TEST_SRC_2_720P_MPEG4,
14961512
),
14971513
)
14981514
@pytest.mark.parametrize("contiguous_indices", (True, False))
14991515
@pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
15001516
def test_beta_cuda_interface_get_frames_at(
15011517
self, asset, contiguous_indices, seek_mode
15021518
):
1503-
if asset == AV1_VIDEO and seek_mode == "approximate":
1504-
pytest.skip("AV1 asset doesn't work with approximate mode")
1519+
if seek_mode == "approximate" and not supports_approximate_mode(asset):
1520+
pytest.skip("asset doesn't work with approximate mode")
15051521

15061522
ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
15071523
beta_decoder = VideoDecoder(
@@ -1518,7 +1534,8 @@ def test_beta_cuda_interface_get_frames_at(
15181534

15191535
ref_frames = ref_decoder.get_frames_at(indices)
15201536
beta_frames = beta_decoder.get_frames_at(indices)
1521-
if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above
1537+
# TODONVDEC P1 see above
1538+
if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
15221539
torch.testing.assert_close(
15231540
beta_frames.data, ref_frames.data, rtol=0, atol=0
15241541
)
@@ -1536,12 +1553,15 @@ def test_beta_cuda_interface_get_frames_at(
15361553
BT709_FULL_RANGE,
15371554
TEST_SRC_2_720P_H265,
15381555
AV1_VIDEO,
1556+
TEST_SRC_2_720P_VP9,
1557+
TEST_SRC_2_720P_VP8,
1558+
TEST_SRC_2_720P_MPEG4,
15391559
),
15401560
)
15411561
@pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
15421562
def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
1543-
if asset == AV1_VIDEO and seek_mode == "approximate":
1544-
pytest.skip("AV1 asset doesn't work with approximate mode")
1563+
if seek_mode == "approximate" and not supports_approximate_mode(asset):
1564+
pytest.skip("asset doesn't work with approximate mode")
15451565

15461566
ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
15471567
beta_decoder = VideoDecoder(
@@ -1556,7 +1576,8 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
15561576
for pts in timestamps:
15571577
ref_frame = ref_decoder.get_frame_played_at(pts)
15581578
beta_frame = beta_decoder.get_frame_played_at(pts)
1559-
if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above
1579+
# TODONVDEC P1 see above
1580+
if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
15601581
torch.testing.assert_close(
15611582
beta_frame.data, ref_frame.data, rtol=0, atol=0
15621583
)
@@ -1573,12 +1594,15 @@ def test_beta_cuda_interface_get_frame_played_at(self, asset, seek_mode):
15731594
BT709_FULL_RANGE,
15741595
TEST_SRC_2_720P_H265,
15751596
AV1_VIDEO,
1597+
TEST_SRC_2_720P_VP9,
1598+
TEST_SRC_2_720P_VP8,
1599+
TEST_SRC_2_720P_MPEG4,
15761600
),
15771601
)
15781602
@pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
15791603
def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
1580-
if asset == AV1_VIDEO and seek_mode == "approximate":
1581-
pytest.skip("AV1 asset doesn't work with approximate mode")
1604+
if seek_mode == "approximate" and not supports_approximate_mode(asset):
1605+
pytest.skip("asset doesn't work with approximate mode")
15821606

15831607
ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
15841608
beta_decoder = VideoDecoder(
@@ -1593,7 +1617,8 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
15931617

15941618
ref_frames = ref_decoder.get_frames_played_at(timestamps)
15951619
beta_frames = beta_decoder.get_frames_played_at(timestamps)
1596-
if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above
1620+
# TODONVDEC P1 see above
1621+
if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
15971622
torch.testing.assert_close(
15981623
beta_frames.data, ref_frames.data, rtol=0, atol=0
15991624
)
@@ -1611,12 +1636,15 @@ def test_beta_cuda_interface_get_frames_played_at(self, asset, seek_mode):
16111636
BT709_FULL_RANGE,
16121637
TEST_SRC_2_720P_H265,
16131638
AV1_VIDEO,
1639+
TEST_SRC_2_720P_VP9,
1640+
TEST_SRC_2_720P_VP8,
1641+
TEST_SRC_2_720P_MPEG4,
16141642
),
16151643
)
16161644
@pytest.mark.parametrize("seek_mode", ("exact", "approximate"))
16171645
def test_beta_cuda_interface_backwards(self, asset, seek_mode):
1618-
if asset == AV1_VIDEO and seek_mode == "approximate":
1619-
pytest.skip("AV1 asset doesn't work with approximate mode")
1646+
if seek_mode == "approximate" and not supports_approximate_mode(asset):
1647+
pytest.skip("asset doesn't work with approximate mode")
16201648

16211649
ref_decoder = VideoDecoder(asset.path, device="cuda", seek_mode=seek_mode)
16221650
beta_decoder = VideoDecoder(
@@ -1635,7 +1663,8 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode):
16351663

16361664
ref_frame = ref_decoder.get_frame_at(frame_index)
16371665
beta_frame = beta_decoder.get_frame_at(frame_index)
1638-
if get_ffmpeg_major_version() > 4: # TODONVDEC P1 see above
1666+
# TODONVDEC P1 see above
1667+
if get_ffmpeg_major_version() > 4 and asset is not TEST_SRC_2_720P_MPEG4:
16391668
torch.testing.assert_close(
16401669
beta_frame.data, ref_frame.data, rtol=0, atol=0
16411670
)

test/utils.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -717,3 +717,40 @@ def sample_format(self) -> str:
717717
},
718718
frames={0: {}}, # Not needed for now
719719
)
720+
721+
# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx-vp9 -b:v 1M output_vp9.webm
722+
TEST_SRC_2_720P_VP9 = TestVideo(
723+
filename="testsrc2_vp9.webm",
724+
default_stream_index=0,
725+
stream_infos={
726+
0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
727+
},
728+
frames={0: {}}, # Not needed for now
729+
)
730+
731+
# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v libvpx -b:v 1M output_vp8.webm
732+
TEST_SRC_2_720P_VP8 = TestVideo(
733+
filename="testsrc2_vp8.webm",
734+
default_stream_index=0,
735+
stream_infos={
736+
0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
737+
},
738+
frames={0: {}}, # Not needed for now
739+
)
740+
741+
# ffmpeg -f lavfi -i testsrc2=size=1280x720:rate=30:duration=1 -c:v mpeg4 -q:v 5 output_mpeg4.avi
742+
TEST_SRC_2_720P_MPEG4 = TestVideo(
743+
filename="testsrc2_mpeg4.avi",
744+
default_stream_index=0,
745+
stream_infos={
746+
0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
747+
},
748+
frames={0: {}}, # Not needed for now
749+
)
750+
751+
752+
def supports_approximate_mode(asset: TestVideo) -> bool:
753+
# TODONVDEC P2: open an issue about this. That's actually not related to
754+
# NVDEC at all, those don't support approximate mode because they don't set
755+
# a duration. CPU decoder fails too!
756+
return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8)

0 commit comments

Comments
 (0)