diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
index 3ce603c3ed2..e7b003f2014 100644
--- a/test/test_transforms_v2.py
+++ b/test/test_transforms_v2.py
@@ -21,6 +21,7 @@
 import torchvision.transforms.v2 as transforms
 
 from common_utils import (
+    assert_close,
     assert_equal,
     cache,
     cpu_and_cuda,
@@ -41,7 +42,6 @@
 )
 
 from torch import nn
-from torch.testing import assert_close
 from torch.utils._pytree import tree_flatten, tree_map
 from torch.utils.data import DataLoader, default_collate
 from torchvision import tv_tensors
@@ -3505,6 +3505,9 @@ def test_kernel_video(self):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_functional(self, make_input):
@@ -3520,16 +3523,36 @@ def test_functional(self, make_input):
             (F.crop_mask, tv_tensors.Mask),
             (F.crop_video, tv_tensors.Video),
             (F.crop_keypoints, tv_tensors.KeyPoints),
+            pytest.param(
+                F._geometry._crop_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA"),
+            ),
         ],
     )
     def test_functional_signature(self, kernel, input_type):
+        if kernel is F._geometry._crop_image_cvcuda:
+            input_type = _import_cvcuda().Tensor
         check_functional_kernel_signature_match(F.crop, kernel=kernel, input_type=input_type)
 
     @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS)
-    def test_functional_image_correctness(self, kwargs):
-        image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
+    def test_functional_image_correctness(self, kwargs, make_input):
+        image = make_input(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
 
         actual = F.crop(image, **kwargs)
+
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+
         expected = F.to_image(F.crop(F.to_pil_image(image), **kwargs))
 
         assert_equal(actual, expected)
@@ -3548,15 +3571,18 @@ def test_functional_image_correctness(self, kwargs):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_transform(self, param, value, make_input):
-        input = make_input(self.INPUT_SIZE)
+        input_data = make_input(self.INPUT_SIZE)
 
         check_sample_input = True
         if param == "fill":
             if isinstance(value, (tuple, list)):
-                if isinstance(input, tv_tensors.Mask):
+                if isinstance(input_data, tv_tensors.Mask):
                     pytest.skip("F.pad_mask doesn't support non-scalar fill.")
                 else:
                     check_sample_input = False
@@ -3565,14 +3591,14 @@ def test_transform(self, param, value, make_input):
                 # 1. size is required
                 # 2. the fill parameter only has an affect if we need padding
                 size=[s + 4 for s in self.INPUT_SIZE],
-                fill=adapt_fill(value, dtype=input.dtype if isinstance(input, torch.Tensor) else torch.uint8),
+                fill=adapt_fill(value, dtype=input_data.dtype if isinstance(input_data, torch.Tensor) else torch.uint8),
             )
         else:
             kwargs = {param: value}
 
         check_transform(
             transforms.RandomCrop(**kwargs, pad_if_needed=True),
-            input,
+            input_data,
             check_v1_compatibility=param != "fill" or isinstance(value, (int, float)),
             check_sample_input=check_sample_input,
         )
@@ -3614,7 +3640,16 @@ def test_transform_pad_if_needed(self):
         padding_mode=["constant", "edge", "reflect", "symmetric"],
     )
     @pytest.mark.parametrize("seed", list(range(5)))
-    def test_transform_image_correctness(self, param, value, seed):
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
+    def test_transform_image_correctness(self, param, value, seed, make_input):
         kwargs = {param: value}
         if param != "size":
             # 1. size is required
@@ -3625,16 +3660,29 @@ def test_transform_image_correctness(self, param, value, seed):
 
         transform = transforms.RandomCrop(pad_if_needed=True, **kwargs)
 
-        image = make_image(self.INPUT_SIZE)
+        will_pad = False
+        if kwargs["size"][0] > self.INPUT_SIZE[0] or kwargs["size"][1] > self.INPUT_SIZE[1]:
+            will_pad = True
+
+        image = make_input(self.INPUT_SIZE)
 
         with freeze_rng_state():
             torch.manual_seed(seed)
             actual = transform(image)
 
             torch.manual_seed(seed)
+
+            if make_input is make_image_cvcuda:
+                image = F.cvcuda_to_tensor(image)[0].cpu()
+
             expected = F.to_image(transform(F.to_pil_image(image)))
 
-        assert_equal(actual, expected)
+        if make_input == make_image_cvcuda and will_pad:
+            # when padding is applied, CV-CUDA will always fill with zeros
+            # cannot use assert_equal since it will fail unless random is all zeros
+            assert_close(actual, expected, rtol=0, atol=get_max_value(image.dtype))
+        else:
+            assert_equal(actual, expected)
 
     def _reference_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width):
         affine_matrix = np.array(
@@ -4458,6 +4506,9 @@ def test_kernel(self, kernel, make_input):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_functional(self, make_input):
@@ -4474,9 +4525,16 @@ def test_functional(self, make_input):
             (F.resized_crop_mask, tv_tensors.Mask),
             (F.resized_crop_video, tv_tensors.Video),
             (F.resized_crop_keypoints, tv_tensors.KeyPoints),
+            pytest.param(
+                F._geometry._resized_crop_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA"),
+            ),
         ],
     )
     def test_functional_signature(self, kernel, input_type):
+        if kernel is F._geometry._resized_crop_image_cvcuda:
+            input_type = _import_cvcuda().Tensor
         check_functional_kernel_signature_match(F.resized_crop, kernel=kernel, input_type=input_type)
 
     @param_value_parametrization(
@@ -4493,6 +4551,9 @@ def test_functional_signature(self, kernel, input_type):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_transform(self, param, value, make_input):
@@ -4504,20 +4565,37 @@ def test_transform(self, param, value, make_input):
 
     # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2.
     # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT`
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
     @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST})
-    def test_functional_image_correctness(self, interpolation):
-        image = make_image(self.INPUT_SIZE, dtype=torch.uint8)
+    def test_functional_image_correctness(self, make_input, interpolation):
+        image = make_input(self.INPUT_SIZE, dtype=torch.uint8)
 
         actual = F.resized_crop(
             image, **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation, antialias=True
         )
+
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+
         expected = F.to_image(
             F.resized_crop(
                 F.to_pil_image(image), **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation
             )
         )
 
-        torch.testing.assert_close(actual, expected, atol=1, rtol=0)
+        atol = 1
+        if make_input is make_image_cvcuda and interpolation == transforms.InterpolationMode.BICUBIC:
+            # CV-CUDA BICUBIC differs from PIL ground truth BICUBIC
+            atol = 10
+        assert_close(actual, expected, atol=atol, rtol=0)
 
     def _reference_resized_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width, size):
         new_height, new_width = size
@@ -4928,6 +5006,9 @@ def test_kernel_video(self):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_functional(self, make_input):
@@ -4943,9 +5024,16 @@ def test_functional(self, make_input):
             (F.center_crop_mask, tv_tensors.Mask),
             (F.center_crop_video, tv_tensors.Video),
             (F.center_crop_keypoints, tv_tensors.KeyPoints),
+            pytest.param(
+                F._geometry._center_crop_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA"),
+            ),
         ],
     )
     def test_functional_signature(self, kernel, input_type):
+        if kernel is F._geometry._center_crop_image_cvcuda:
+            input_type = _import_cvcuda().Tensor
         check_functional_kernel_signature_match(F.center_crop, kernel=kernel, input_type=input_type)
 
     @pytest.mark.parametrize(
@@ -4958,17 +5046,33 @@ def test_functional_signature(self, kernel, input_type):
             make_segmentation_mask,
             make_video,
             make_keypoints,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
         ],
     )
     def test_transform(self, make_input):
         check_transform(transforms.CenterCrop(self.OUTPUT_SIZES[0]), make_input(self.INPUT_SIZE))
 
     @pytest.mark.parametrize("output_size", OUTPUT_SIZES)
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
     @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)])
-    def test_image_correctness(self, output_size, fn):
-        image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
+    def test_image_correctness(self, output_size, make_input, fn):
+        image = make_input(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
 
         actual = fn(image, output_size)
+
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+
         expected = F.to_image(F.center_crop(F.to_pil_image(image), output_size=output_size))
 
         assert_equal(actual, expected)
@@ -6243,7 +6347,15 @@ def wrapper(*args, **kwargs):
 
     @pytest.mark.parametrize(
         "make_input",
-        [make_image_tensor, make_image_pil, make_image, make_video],
+        [
+            make_image_tensor,
+            make_image_pil,
+            make_image,
+            make_video,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
     )
     @pytest.mark.parametrize("functional", [F.five_crop, F.ten_crop])
     def test_functional(self, make_input, functional):
@@ -6261,13 +6373,27 @@ def test_functional(self, make_input, functional):
             (F.five_crop, F._geometry._five_crop_image_pil, PIL.Image.Image),
             (F.five_crop, F.five_crop_image, tv_tensors.Image),
             (F.five_crop, F.five_crop_video, tv_tensors.Video),
+            pytest.param(
+                F.five_crop,
+                F._geometry._five_crop_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA"),
+            ),
             (F.ten_crop, F.ten_crop_image, torch.Tensor),
             (F.ten_crop, F._geometry._ten_crop_image_pil, PIL.Image.Image),
             (F.ten_crop, F.ten_crop_image, tv_tensors.Image),
             (F.ten_crop, F.ten_crop_video, tv_tensors.Video),
+            pytest.param(
+                F.ten_crop,
+                F._geometry._ten_crop_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA"),
+            ),
         ],
     )
     def test_functional_signature(self, functional, kernel, input_type):
+        if kernel is F._geometry._five_crop_image_cvcuda or kernel is F._geometry._ten_crop_image_cvcuda:
+            input_type = _import_cvcuda().Tensor
         check_functional_kernel_signature_match(functional, kernel=kernel, input_type=input_type)
 
     class _TransformWrapper(nn.Module):
@@ -6289,7 +6415,15 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 
     @pytest.mark.parametrize(
         "make_input",
-        [make_image_tensor, make_image_pil, make_image, make_video],
+        [
+            make_image_tensor,
+            make_image_pil,
+            make_image,
+            make_video,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
     )
     @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop])
     def test_transform(self, make_input, transform_cls):
@@ -6307,19 +6441,41 @@ def test_transform_error(self, make_input, transform_cls):
         with pytest.raises(TypeError, match="not supported"):
             transform(make_input(self.INPUT_SIZE))
 
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
     @pytest.mark.parametrize("fn", [F.five_crop, transform_cls_to_functional(transforms.FiveCrop)])
-    def test_correctness_image_five_crop(self, fn):
-        image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
+    def test_correctness_image_five_crop(self, make_input, fn):
+        image = make_input(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
 
         actual = fn(image, size=self.OUTPUT_SIZE)
+
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+
         expected = F.five_crop(F.to_pil_image(image), size=self.OUTPUT_SIZE)
 
         assert isinstance(actual, tuple)
         assert_equal(actual, [F.to_image(e) for e in expected])
 
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda, marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="test requires CVCUDA")
+            ),
+        ],
+    )
     @pytest.mark.parametrize("fn_or_class", [F.ten_crop, transforms.TenCrop])
     @pytest.mark.parametrize("vertical_flip", [False, True])
-    def test_correctness_image_ten_crop(self, fn_or_class, vertical_flip):
+    def test_correctness_image_ten_crop(self, make_input, fn_or_class, vertical_flip):
         if fn_or_class is transforms.TenCrop:
             fn = transform_cls_to_functional(fn_or_class, size=self.OUTPUT_SIZE, vertical_flip=vertical_flip)
             kwargs = dict()
@@ -6327,9 +6483,13 @@ def test_correctness_image_ten_crop(self, fn_or_class, vertical_flip):
             fn = fn_or_class
             kwargs = dict(size=self.OUTPUT_SIZE, vertical_flip=vertical_flip)
 
-        image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
+        image = make_input(self.INPUT_SIZE, dtype=torch.uint8, device="cpu")
 
         actual = fn(image, **kwargs)
+
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+
         expected = F.ten_crop(F.to_pil_image(image), size=self.OUTPUT_SIZE, vertical_flip=vertical_flip)
 
         assert isinstance(actual, tuple)
diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py
index 96166e05e9a..6888e6d41f4 100644
--- a/torchvision/transforms/v2/_geometry.py
+++ b/torchvision/transforms/v2/_geometry.py
@@ -139,6 +139,9 @@ class Resize(Transform):
 
     _v1_transform_cls = _transforms.Resize
 
+    if CVCUDA_AVAILABLE:
+        _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def __init__(
         self,
         size: Union[int, Sequence[int], None],
@@ -194,6 +197,9 @@ class CenterCrop(Transform):
 
     _v1_transform_cls = _transforms.CenterCrop
 
+    if CVCUDA_AVAILABLE:
+        _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def __init__(self, size: Union[int, Sequence[int]]):
         super().__init__()
         self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
@@ -252,6 +258,9 @@ class RandomResizedCrop(Transform):
 
     _v1_transform_cls = _transforms.RandomResizedCrop
 
+    if CVCUDA_AVAILABLE:
+        _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def __init__(
         self,
         size: Union[int, Sequence[int]],
@@ -360,6 +369,9 @@ class FiveCrop(Transform):
 
     _v1_transform_cls = _transforms.FiveCrop
 
+    if CVCUDA_AVAILABLE:
+        _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def __init__(self, size: Union[int, Sequence[int]]) -> None:
         super().__init__()
         self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
@@ -404,6 +416,9 @@ class TenCrop(Transform):
 
     _v1_transform_cls = _transforms.TenCrop
 
+    if CVCUDA_AVAILABLE:
+        _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def __init__(self, size: Union[int, Sequence[int]], vertical_flip: bool = False) -> None:
         super().__init__()
         self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
@@ -811,6 +826,9 @@ class RandomCrop(Transform):
 
     _v1_transform_cls = _transforms.RandomCrop
 
+    if CVCUDA_AVAILABLE:
+        _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def _extract_params_for_v1_transform(self) -> dict[str, Any]:
         params = super()._extract_params_for_v1_transform()
 
@@ -1121,6 +1139,9 @@ class RandomIoUCrop(Transform):
             Default, 40.
     """
 
+    if CVCUDA_AVAILABLE:
+        _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,)
+
     def __init__(
         self,
         min_scale: float = 0.3,
@@ -1402,6 +1423,9 @@ class RandomResize(Transform):
             v0.17, for the PIL and Tensor backends to be consistent.
""" + if CVCUDA_AVAILABLE: + _transformed_types = Transform._transformed_types + (_is_cvcuda_tensor,) + def __init__( self, min_size: int, diff --git a/torchvision/transforms/v2/_utils.py b/torchvision/transforms/v2/_utils.py index bb6051b4e61..e803aa49c60 100644 --- a/torchvision/transforms/v2/_utils.py +++ b/torchvision/transforms/v2/_utils.py @@ -16,7 +16,7 @@ from torchvision.transforms.transforms import _check_sequence_input, _setup_angle, _setup_size # noqa: F401 from torchvision.transforms.v2.functional import get_dimensions, get_size, is_pure_tensor -from torchvision.transforms.v2.functional._utils import _FillType, _FillTypeJIT +from torchvision.transforms.v2.functional._utils import _FillType, _FillTypeJIT, _is_cvcuda_tensor def _setup_number_or_seq(arg: int | float | Sequence[int | float], name: str) -> Sequence[float]: @@ -182,7 +182,7 @@ def query_chw(flat_inputs: list[Any]) -> tuple[int, int, int]: chws = { tuple(get_dimensions(inpt)) for inpt in flat_inputs - if check_type(inpt, (is_pure_tensor, tv_tensors.Image, PIL.Image.Image, tv_tensors.Video)) + if check_type(inpt, (is_pure_tensor, tv_tensors.Image, PIL.Image.Image, tv_tensors.Video, _is_cvcuda_tensor)) } if not chws: raise TypeError("No image or video was found in the sample") @@ -207,6 +207,7 @@ def query_size(flat_inputs: list[Any]) -> tuple[int, int]: tv_tensors.Mask, tv_tensors.BoundingBoxes, tv_tensors.KeyPoints, + _is_cvcuda_tensor, ), ) } diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index 0e27218bc89..a78f1c33f2e 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -602,6 +602,32 @@ def resize_video( return resize_image(video, size=size, interpolation=interpolation, max_size=max_size, antialias=antialias) +def _resize_image_cvcuda( + image: "cvcuda.Tensor", + size: Optional[list[int]], + interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, + max_size: Optional[int] = None, + antialias: Optional[bool] = True, +) -> "cvcuda.Tensor": + # placeholder func for now, will be handled in PR for resize alone + # since placeholder convert to from torch tensor and use resize_image + from ._type_conversion import cvcuda_to_tensor, to_cvcuda_tensor + + return to_cvcuda_tensor( + resize_image( + cvcuda_to_tensor(image), + size=size, + interpolation=interpolation, + max_size=max_size, + antialias=antialias, + ) + ) + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(resize, _import_cvcuda().Tensor)(_resize_image_cvcuda) + + def affine( inpt: torch.Tensor, angle: Union[int, float], @@ -1924,6 +1950,50 @@ def crop_video(video: torch.Tensor, top: int, left: int, height: int, width: int return crop_image(video, top, left, height, width) +def _crop_image_cvcuda( + image: "cvcuda.Tensor", + top: int, + left: int, + height: int, + width: int, +) -> "cvcuda.Tensor": + cvcuda = _import_cvcuda() + + image_height, image_width, channels = image.shape[1:] + top_diff = 0 + left_diff = 0 + height_diff = 0 + width_diff = 0 + if top < 0: + top_diff = int(-1 * top) + if left < 0: + left_diff = int(-1 * left) + if top + height > image_height: + height_diff = int(top + height - image_height) + if left + width > image_width: + width_diff = int(left + width - image_width) + if top_diff or left_diff or height_diff or width_diff: + image = cvcuda.copymakeborder( + image, + border_mode=cvcuda.Border.CONSTANT, + border_value=[0.0] * channels, + top=top_diff, + left=left_diff, + 
bottom=height_diff, + right=width_diff, + ) + top = top + top_diff + left = left + left_diff + return cvcuda.customcrop( + image, + cvcuda.RectI(x=left, y=top, width=width, height=height), + ) + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(crop, _import_cvcuda().Tensor)(_crop_image_cvcuda) + + def perspective( inpt: torch.Tensor, startpoints: Optional[list[list[int]]], @@ -2674,6 +2744,47 @@ def center_crop_video(video: torch.Tensor, output_size: list[int]) -> torch.Tens return center_crop_image(video, output_size) +def _center_crop_image_cvcuda( + image: "cvcuda.Tensor", + output_size: list[int], +) -> "cvcuda.Tensor": + cvcuda = _import_cvcuda() + + crop_height, crop_width = _center_crop_parse_output_size(output_size) + # we only allow cvcuda conversion for 4 ndim, and always use nhwc layout + image_height = image.shape[1] + image_width = image.shape[2] + channels = image.shape[3] + if crop_height > image_height or crop_width > image_width: + padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) + image = cvcuda.copymakeborder( + image, + border_mode=cvcuda.Border.CONSTANT, + border_value=[0.0] * channels, + top=padding_ltrb[1], + left=padding_ltrb[0], + bottom=padding_ltrb[3], + right=padding_ltrb[2], + ) + + image_height = image.shape[1] + image_width = image.shape[2] + + if crop_width == image_width and crop_height == image_height: + return image + + # use customcrop to match crop_image behavior + crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, image_height, image_width) + return cvcuda.customcrop( + image, + cvcuda.RectI(x=crop_left, y=crop_top, width=crop_width, height=crop_height), + ) + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(center_crop, _import_cvcuda().Tensor)(_center_crop_image_cvcuda) + + def resized_crop( inpt: torch.Tensor, top: int, @@ -2860,6 +2971,24 @@ def resized_crop_video( ) +def _resized_crop_image_cvcuda( + image: "cvcuda.Tensor", + top: int, + left: int, + height: int, + width: int, + size: list[int], + interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, + antialias: Optional[bool] = True, +) -> "cvcuda.Tensor": + image = _crop_image_cvcuda(image, top, left, height, width) + return _resize_image_cvcuda(image, size, interpolation=interpolation, antialias=antialias) + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(resized_crop, _import_cvcuda().Tensor)(_resized_crop_image_cvcuda) + + def five_crop( inpt: torch.Tensor, size: list[int] ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: @@ -2933,6 +3062,29 @@ def five_crop_video( return five_crop_image(video, size) +def _five_crop_image_cvcuda( + image: "cvcuda.Tensor", + size: list[int], +) -> tuple["cvcuda.Tensor", "cvcuda.Tensor", "cvcuda.Tensor", "cvcuda.Tensor", "cvcuda.Tensor"]: + crop_height, crop_width = _parse_five_crop_size(size) + image_height, image_width = image.shape[1], image.shape[2] + + if crop_width > image_width or crop_height > image_height: + raise ValueError(f"Requested crop size {size} is bigger than input size {(image_height, image_width)}") + + tl = _crop_image_cvcuda(image, 0, 0, crop_height, crop_width) + tr = _crop_image_cvcuda(image, 0, image_width - crop_width, crop_height, crop_width) + bl = _crop_image_cvcuda(image, image_height - crop_height, 0, crop_height, crop_width) + br = _crop_image_cvcuda(image, image_height - crop_height, image_width - crop_width, crop_height, crop_width) + center = _center_crop_image_cvcuda(image, [crop_height, 
crop_width]) + + return tl, tr, bl, br, center + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(five_crop, _import_cvcuda().Tensor)(_five_crop_image_cvcuda) + + def ten_crop( inpt: torch.Tensor, size: list[int], vertical_flip: bool = False ) -> tuple[ @@ -3028,3 +3180,35 @@ def ten_crop_video( torch.Tensor, ]: return ten_crop_image(video, size, vertical_flip=vertical_flip) + + +def _ten_crop_image_cvcuda( + image: "cvcuda.Tensor", + size: list[int], + vertical_flip: bool = False, +) -> tuple[ + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", + "cvcuda.Tensor", +]: + non_flipped = _five_crop_image_cvcuda(image, size) + + if vertical_flip: + image = _vertical_flip_image_cvcuda(image) + else: + image = _horizontal_flip_image_cvcuda(image) + + flipped = _five_crop_image_cvcuda(image, size) + + return non_flipped + flipped + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(ten_crop, _import_cvcuda().Tensor)(_ten_crop_image_cvcuda) diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py index 6b8f19f12f4..af03ad018d4 100644 --- a/torchvision/transforms/v2/functional/_meta.py +++ b/torchvision/transforms/v2/functional/_meta.py @@ -51,6 +51,16 @@ def get_dimensions_video(video: torch.Tensor) -> list[int]: return get_dimensions_image(video) +def get_dimensions_image_cvcuda(image: "cvcuda.Tensor") -> list[int]: + # CV-CUDA tensor is always in NHWC layout + # get_dimensions is CHW + return [image.shape[3], image.shape[1], image.shape[2]] + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(get_dimensions, cvcuda.Tensor)(get_dimensions_image_cvcuda) + + def get_num_channels(inpt: torch.Tensor) -> int: if torch.jit.is_scripting(): return get_num_channels_image(inpt) @@ -87,6 +97,16 @@ def get_num_channels_video(video: torch.Tensor) -> int: get_image_num_channels = get_num_channels +def get_num_channels_image_cvcuda(image: "cvcuda.Tensor") -> int: + # CV-CUDA tensor is always in NHWC layout + # get_num_channels is C + return image.shape[3] + + +if CVCUDA_AVAILABLE: + _register_kernel_internal(get_num_channels, cvcuda.Tensor)(get_num_channels_image_cvcuda) + + def get_size(inpt: torch.Tensor) -> list[int]: if torch.jit.is_scripting(): return get_size_image(inpt) @@ -125,7 +145,7 @@ def get_size_image_cvcuda(image: "cvcuda.Tensor") -> list[int]: if CVCUDA_AVAILABLE: - _get_size_image_cvcuda = _register_kernel_internal(get_size, cvcuda.Tensor)(get_size_image_cvcuda) + _register_kernel_internal(get_size, _import_cvcuda().Tensor)(get_size_image_cvcuda) @_register_kernel_internal(get_size, tv_tensors.Video, tv_tensor_wrapper=False)