Decoder-native resize public implementation #1003
```diff
@@ -8,7 +8,7 @@
 import json
 import numbers
 from pathlib import Path
-from typing import Literal, Optional, Tuple, Union
+from typing import Any, List, Literal, Optional, Tuple, Union
 
 import torch
 from torch import device as torch_device, Tensor
```
```diff
@@ -103,6 +103,7 @@ def __init__(
         dimension_order: Literal["NCHW", "NHWC"] = "NCHW",
         num_ffmpeg_threads: int = 1,
         device: Optional[Union[str, torch_device]] = "cpu",
+        transforms: List[Any] = [],  # TRANSFORMS TODO: what is the user-facing type?
```
**Review comment:** Discussion point 1: If we accept TorchVision transforms, and we want to lazily load TorchVision, what type do we advertise here? We can easily explain that we accept a TorchVision transform in the docs, but what should we put in the type annotation?

**Reply:** It should probably be either […]

**Reply:** Oh, that solves the problem nicely: it can definitely be […]
```diff
         seek_mode: Literal["exact", "approximate"] = "exact",
         custom_frame_mappings: Optional[
             Union[str, bytes, io.RawIOBase, io.BufferedReader]
```
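Regarding discussion point 1 above: one common way to keep a precise annotation while still importing TorchVision lazily is a `typing.TYPE_CHECKING` guard combined with deferred annotation evaluation. This is only a sketch of that option; the class name is a placeholder and the annotation is an assumption, not what the PR settles on.

```python
# Sketch: advertise a precise TorchVision type without importing torchvision
# at module import time. Names here are placeholders, not the PR's final API.
from __future__ import annotations

from typing import List, Optional, TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; torchvision remains a lazy runtime import.
    from torchvision.transforms import v2


class VideoDecoderSketch:
    def __init__(self, transforms: Optional[List[v2.Transform]] = None):
        # Avoid a mutable default; torchvision would only be imported inside
        # the methods that actually need it (e.g. when building transform specs).
        self.transforms = list(transforms) if transforms is not None else []
```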
```diff
@@ -148,13 +149,16 @@ def __init__(
 
         device_variant = _get_cuda_backend()
 
+        transform_specs = make_transform_specs(transforms)
+
         core.add_video_stream(
             self._decoder,
             stream_index=stream_index,
             dimension_order=dimension_order,
             num_threads=num_ffmpeg_threads,
             device=device,
             device_variant=device_variant,
+            transform_specs=transform_specs,
             custom_frame_mappings=custom_frame_mappings_data,
         )
```
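For context, the tests further down exercise this wiring through the public API roughly as follows. This is a usage sketch assembled from the test code; `"video.mp4"` is a placeholder path.

```python
from torchcodec.decoders import VideoDecoder
from torchvision.transforms import v2

# Decoder-native resize: frames come back already resized to (height, width).
decoder = VideoDecoder("video.mp4", transforms=[v2.Resize(size=(135, 240))])
frame = decoder[0]
print(frame.shape)  # e.g. (3, 135, 240) with the default "NCHW" dimension order
```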
```diff
@@ -432,6 +436,22 @@ def _get_and_validate_stream_metadata(
     )
 
 
+def make_transform_specs(transforms: List[Any]) -> str:
+    from torchvision.transforms import v2
+
+    transform_specs = []
+    for transform in transforms:
+        if isinstance(transform, v2.Resize):
+            if len(transform.size) != 2:
+                raise ValueError(
+                    f"Resize transform must have a (height, width) pair for the size, got {transform.size}."
+                )
+            transform_specs.append(f"resize, {transform.size[0]}, {transform.size[1]}")
+        else:
+            raise ValueError(f"Unsupported transform {transform}.")
+    return ";".join(transform_specs)
```
**Review comment:** Discussion point 2: This is what we'll have to do with TorchVision transforms at the moment. We'll need special handling for each transform, looking into its internals to get what we need and enforce decoder-native limitations. In the future, we can change TorchVision transforms to have an API so that we can get what we need in a generic way. But for now, we'll need to do something like this.

**Reply:** I'm still undecided on whether we should accept TV transforms or not (ironic, I know), but I think this is totally OK. And I think we'll need that level of coupling anyway, even if we were to write our own TC transforms. Echoing what you wrote: basically, that coupling between TC and TV will have to exist either in the code (as in this PR), or in our heads as API designers. Side note, slightly related: if we're going to have our own TC transforms, I think we'll want their API to exactly match (or be a strict subset of) the TV transforms. E.g. we'd have […]

**Reply:** @NicolasHug, I came to the same conclusion as: […] At which point, I don't think we've really gained anything by having them separate. And users will probably also start asking, hey, can you just accept the TorchVision ones? I also just realized a new counter-point, which I'll put up in the summary as counter point 3.
```diff
 
 
 def _read_custom_frame_mappings(
     custom_frame_mappings: Union[str, bytes, io.RawIOBase, io.BufferedReader]
 ) -> tuple[Tensor, Tensor, Tensor]:
```
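To make the side note above concrete: if TorchCodec ever grows its own decoder-native transforms, their constructors could mirror (or be a strict subset of) the TorchVision ones. The following is a purely hypothetical sketch, not part of this PR.

```python
from typing import Sequence


class Resize:
    """Hypothetical TorchCodec-native resize whose constructor mirrors v2.Resize."""

    def __init__(self, size: Sequence[int]):
        if len(size) != 2:
            raise ValueError(f"size must be a (height, width) pair, got {size}.")
        self.size = tuple(size)

    def _make_transform_spec(self) -> str:
        # Same "resize, height, width" spec format that the PR's helper emits.
        return f"resize, {self.size[0]}, {self.size[1]}"
```

With that shape, swapping `v2.Resize(size=(h, w))` for a decoder-native `Resize(size=(h, w))` would be a one-line change for users, which is exactly why the two APIs would stay coupled either way.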
```diff
@@ -22,15 +22,75 @@
     get_json_metadata,
     get_next_frame,
 )
+from torchcodec.decoders import VideoDecoder
 
+from torchvision.transforms import v2
 
-from .utils import assert_frames_equal, NASA_VIDEO, needs_cuda
+from .utils import assert_frames_equal, NASA_VIDEO, needs_cuda, psnr
 
 torch._dynamo.config.capture_dynamic_output_shape_ops = True
```
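The new tests compare decoder-native output against TorchVision's resize with the `psnr` helper imported from the test utils. Its implementation is not part of this diff; a typical definition looks roughly like the sketch below (an assumption, shown only for readability, for 0-255 frames).

```python
import torch


def psnr(a: torch.Tensor, b: torch.Tensor, max_val: float = 255.0) -> float:
    # Peak signal-to-noise ratio in dB; higher means closer, identical frames give inf.
    mse = torch.mean((a.float() - b.float()) ** 2)
    if mse == 0:
        return float("inf")
    return (20 * torch.log10(torch.tensor(max_val)) - 10 * torch.log10(mse)).item()
```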
```diff
-class TestVideoDecoderTransformOps:
+class TestPublicVideoDecoderTransformOps:
+    @pytest.mark.parametrize(
+        "height_scaling_factor, width_scaling_factor",
+        ((1.5, 1.31), (0.5, 0.71), (0.7, 1.31), (1.5, 0.71), (1.0, 1.0)),
+    )
+    def test_resize_torchvision(self, height_scaling_factor, width_scaling_factor):
+        height = int(NASA_VIDEO.get_height() * height_scaling_factor)
+        width = int(NASA_VIDEO.get_width() * width_scaling_factor)
+
+        decoder_resize = VideoDecoder(
+            NASA_VIDEO.path, transforms=[v2.Resize(size=(height, width))]
+        )
+        decoder_full = VideoDecoder(NASA_VIDEO.path)
+        for frame_index in [0, 10, 17, 100, 230, 389]:
+            expected_shape = (NASA_VIDEO.get_num_color_channels(), height, width)
+            frame_resize = decoder_resize[frame_index]
+
+            frame_full = decoder_full[frame_index]
+            frame_tv = v2.functional.resize(frame_full, size=(height, width))
+
+            assert frame_resize.shape == expected_shape
+            assert frame_tv.shape == expected_shape
+
+            # Copied from PR #992; not sure if it's the best way to check
+            assert psnr(frame_resize, frame_tv) > 25
+
+    def test_resize_ffmpeg(self):
+        height = 135
+        width = 240
+        expected_shape = (NASA_VIDEO.get_num_color_channels(), height, width)
+        resize_filtergraph = f"scale={width}:{height}:flags=bilinear"
+        decoder_resize = VideoDecoder(
+            NASA_VIDEO.path, transforms=[v2.Resize(size=(height, width))]
+        )
+        for frame_index in [17, 230, 389]:
+            frame_resize = decoder_resize[frame_index]
+            frame_ref = NASA_VIDEO.get_frame_data_by_index(
+                frame_index, filters=resize_filtergraph
+            )
+
+            assert frame_resize.shape == expected_shape
+            assert frame_ref.shape == expected_shape
+            assert_frames_equal(frame_resize, frame_ref)
```
**Review comment:** Currently fails. Still investigating.
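While that failure is being investigated, one quick way to quantify the mismatch is to compare the two frames numerically rather than exactly. This sketch is not part of the PR and reuses the objects and helpers from `test_resize_ffmpeg` above, so it only runs in that test's context.

```python
# Compare decoder-native output against the FFmpeg CLI reference numerically,
# reusing decoder_resize, resize_filtergraph, NASA_VIDEO, and psnr from the test.
frame_resize = decoder_resize[17]
frame_ref = NASA_VIDEO.get_frame_data_by_index(17, filters=resize_filtergraph)
print("max abs diff:", (frame_resize.int() - frame_ref.int()).abs().max().item())
print("psnr:", psnr(frame_resize, frame_ref))
```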
```diff
+
+    def test_resize_fails(self):
+        with pytest.raises(
+            ValueError,
+            match=r"must have a \(height, width\) pair for the size",
+        ):
+            VideoDecoder(NASA_VIDEO.path, transforms=[v2.Resize(size=(100))])
+
+    def test_transform_fails(self):
+        with pytest.raises(
+            ValueError,
+            match="Unsupported transform",
+        ):
+            VideoDecoder(NASA_VIDEO.path, transforms=[v2.RandomHorizontalFlip(p=1.0)])
+
+
+class TestCoreVideoDecoderTransformOps:
     # We choose arbitrary values for width and height scaling to get better
     # test coverage. Some pairs upscale the image while others downscale it.
     @pytest.mark.parametrize(
```
**Review comment:** From the FFmpeg docs: […] Whereas `flags` is the specific parameter to `scale`. They end up being semantically equivalent, but it's more clear to use the `scale` option here.
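To illustrate the distinction in code: both of the following filtergraph strings request bilinear scaling, the first through the `scale` filter's own `flags` option (as in `test_resize_ffmpeg`), the second through the graph-level `sws_flags` option. Treating them as equivalent here is an assumption based on the comment above.

```python
# Two ways of asking FFmpeg for bilinear scaling in a filtergraph string.
scale_filter_flags = "scale=240:135:flags=bilinear"      # flags is an option of the scale filter
graph_level_sws = "sws_flags=bilinear;scale=240:135"     # sws_flags applies to the whole filtergraph
```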