pytorch
diff --git a/.github/scripts/setup-env.sh b/.github/scripts/setup-env.sh
Lines changed: 6 additions & 0 deletions
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
Lines changed: 30 additions & 0 deletions
diff --git a/test/common_utils.py b/test/common_utils.py
Lines changed: 24 additions & 21 deletions
diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
Lines changed: 71 additions & 19 deletions
diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py
Lines changed: 9 additions & 1 deletion
@@ -82,6 +82,12 @@ echo '::group::Install TorchVision'
 pip install -e . -v --no-build-isolation
 echo '::endgroup::'
 
+if [[ "${CVCUDA:-}" == "1" ]]; then
+  echo '::group::Install CV-CUDA'
+  pip install --progress-bar=off cvcuda-cu12
+  echo '::endgroup::'
+fi
+
 echo '::group::Install torchvision-extra-decoders'
 # This can be done after torchvision was built
 if [[ "$(uname)" == "Linux" && "$(uname -m)" != "aarch64" ]]; then
 
@@ -45,6 +45,36 @@ jobs:
 
         ./.github/scripts/unittest.sh
 
+  unittests-linux-cvcuda:
+    strategy:
+      matrix:
+        python-version:
+          - "3.10"
+        runner: ["linux.g5.4xlarge.nvidia.gpu"]
+        gpu-arch-type: ["cuda"]
+        gpu-arch-version: ["12.6"]
+      fail-fast: false
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      repository: pytorch/vision
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      timeout: 120
+      test-infra-ref: main
+      script: |
+        set -euo pipefail
+
+        export PYTHON_VERSION=${{ matrix.python-version }}
+        export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }}
+        export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }}
+        export CVCUDA="1"
+
+        ./.github/scripts/unittest.sh
+
   unittests-macos:
     strategy:
       matrix:
 
@@ -21,14 +21,13 @@
 from torchvision import io, tv_tensors
 from torchvision.transforms._functional_tensor import _max_value as get_max_value
 from torchvision.transforms.v2.functional import cvcuda_to_tensor, to_cvcuda_tensor, to_image, to_pil_image
-from torchvision.transforms.v2.functional._utils import _import_cvcuda, _is_cvcuda_available
+from torchvision.transforms.v2.functional._utils import _is_cvcuda_available, _is_cvcuda_tensor
 from torchvision.utils import _Image_fromarray
 
 
 IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"])
 IN_RE_WORKER = os.environ.get("INSIDE_RE_WORKER") is not None
 IN_FBCODE = os.environ.get("IN_FBCODE_TORCHVISION") == "1"
-CVCUDA_AVAILABLE = _is_cvcuda_available()
 CUDA_NOT_AVAILABLE_MSG = "CUDA device not available"
 MPS_NOT_AVAILABLE_MSG = "MPS device not available"
 OSS_CI_GPU_NO_CUDA_MSG = "We're in an OSS GPU machine, and this test doesn't need cuda."
@@ -277,17 +276,6 @@ def combinations_grid(**kwargs):
     return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())]
 
 
-def cvcuda_to_pil_compatible_tensor(tensor: "cvcuda.Tensor") -> torch.Tensor:
-    tensor = cvcuda_to_tensor(tensor)
-    if tensor.ndim != 4:
-        raise ValueError(f"CV-CUDA Tensor should be 4 dimensional. Got {tensor.ndim} dimensions.")
-    if tensor.shape[0] != 1:
-        raise ValueError(
-            f"CV-CUDA Tensor should have batch dimension 1 for comparison with PIL.Image.Image. Got {tensor.shape[0]}."
-        )
-    return tensor.squeeze(0).cpu()
-
-
 class ImagePair(TensorLikePair):
     def __init__(
         self,
@@ -297,13 +285,24 @@ def __init__(
         mae=False,
         **other_parameters,
     ):
-        if all(isinstance(input, PIL.Image.Image) for input in [actual, expected]):
-            actual, expected = (to_image(input) for input in [actual, expected])
-
-        # handle check for CV-CUDA Tensors
-        if CVCUDA_AVAILABLE and isinstance(actual, _import_cvcuda().Tensor):
-            # Use the PIL compatible tensor, so we can always compare with PIL.Image.Image
-            actual = cvcuda_to_pil_compatible_tensor(actual)
+        # Convert PIL images to tv_tensors.Image (regardless of what the other is)
+        if isinstance(actual, PIL.Image.Image):
+            actual = to_image(actual)
+        if isinstance(expected, PIL.Image.Image):
+            expected = to_image(expected)
+
+        if _is_cvcuda_available():
+            if _is_cvcuda_tensor(actual):
+                actual = cvcuda_to_tensor(actual)
+                # Remove batch dimension if it's 1 for easier comparison against 3D PIL images
+                if actual.shape[0] == 1:
+                    actual = actual[0]
+                actual = actual.cpu()
+            if _is_cvcuda_tensor(expected):
+                expected = cvcuda_to_tensor(expected)
+                if expected.shape[0] == 1:
+                    expected = expected[0]
+                expected = expected.cpu()
 
         super().__init__(actual, expected, **other_parameters)
         self.mae = mae
@@ -559,5 +558,9 @@ def ignore_jit_no_profile_information_warning():
     # with varying `INT1` and `INT2`. Since these are uninteresting for us and only clutter the test summary, we ignore
     # them.
     with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", message=re.escape("operator() profile_node %"), category=UserWarning)
+        warnings.filterwarnings(
+            "ignore",
+            message=re.escape("operator() profile_node %"),
+            category=UserWarning,
+        )
         yield
@@ -1241,6 +1241,10 @@ def test_kernel_video(self):
             make_image_tensor,
             make_image_pil,
             make_image,
+            pytest.param(
+                make_image_cvcuda,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"),
+            ),
             make_bounding_boxes,
             make_segmentation_mask,
             make_video,
@@ -1256,13 +1260,20 @@ def test_functional(self, make_input):
             (F.horizontal_flip_image, torch.Tensor),
             (F._geometry._horizontal_flip_image_pil, PIL.Image.Image),
             (F.horizontal_flip_image, tv_tensors.Image),
+            pytest.param(
+                F._geometry._horizontal_flip_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"),
+            ),
             (F.horizontal_flip_bounding_boxes, tv_tensors.BoundingBoxes),
             (F.horizontal_flip_mask, tv_tensors.Mask),
             (F.horizontal_flip_video, tv_tensors.Video),
             (F.horizontal_flip_keypoints, tv_tensors.KeyPoints),
         ],
     )
     def test_functional_signature(self, kernel, input_type):
+        if kernel is F._geometry._horizontal_flip_image_cvcuda:
+            input_type = _import_cvcuda().Tensor
         check_functional_kernel_signature_match(F.horizontal_flip, kernel=kernel, input_type=input_type)
 
     @pytest.mark.parametrize(
@@ -1271,6 +1282,10 @@ def test_functional_signature(self, kernel, input_type):
             make_image_tensor,
             make_image_pil,
             make_image,
+            pytest.param(
+                make_image_cvcuda,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"),
+            ),
             make_bounding_boxes,
             make_segmentation_mask,
             make_video,
@@ -1284,13 +1299,23 @@ def test_transform(self, make_input, device):
     @pytest.mark.parametrize(
         "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)]
     )
-    def test_image_correctness(self, fn):
-        image = make_image(dtype=torch.uint8, device="cpu")
-
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"),
+            ),
+        ],
+    )
+    def test_image_correctness(self, fn, make_input):
+        image = make_input()
         actual = fn(image)
-        expected = F.to_image(F.horizontal_flip(F.to_pil_image(image)))
-
-        torch.testing.assert_close(actual, expected)
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+        expected = F.horizontal_flip(F.to_pil_image(image))
+        assert_equal(actual, expected)
 
     def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes):
         affine_matrix = np.array(
@@ -1346,6 +1371,10 @@ def test_keypoints_correctness(self, fn):
             make_image_tensor,
             make_image_pil,
             make_image,
+            pytest.param(
+                make_image_cvcuda,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"),
+            ),
             make_bounding_boxes,
             make_segmentation_mask,
             make_video,
@@ -1355,11 +1384,8 @@ def test_keypoints_correctness(self, fn):
     @pytest.mark.parametrize("device", cpu_and_cuda())
     def test_transform_noop(self, make_input, device):
         input = make_input(device=device)
-
         transform = transforms.RandomHorizontalFlip(p=0)
-
         output = transform(input)
-
         assert_equal(output, input)
 
 
@@ -1857,6 +1883,10 @@ def test_kernel_video(self):
             make_image_tensor,
             make_image_pil,
             make_image,
+            pytest.param(
+                make_image_cvcuda,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"),
+            ),
             make_bounding_boxes,
             make_segmentation_mask,
             make_video,
@@ -1872,13 +1902,20 @@ def test_functional(self, make_input):
             (F.vertical_flip_image, torch.Tensor),
             (F._geometry._vertical_flip_image_pil, PIL.Image.Image),
             (F.vertical_flip_image, tv_tensors.Image),
+            pytest.param(
+                F._geometry._vertical_flip_image_cvcuda,
+                None,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"),
+            ),
             (F.vertical_flip_bounding_boxes, tv_tensors.BoundingBoxes),
             (F.vertical_flip_mask, tv_tensors.Mask),
             (F.vertical_flip_video, tv_tensors.Video),
             (F.vertical_flip_keypoints, tv_tensors.KeyPoints),
         ],
     )
     def test_functional_signature(self, kernel, input_type):
+        if kernel is F._geometry._vertical_flip_image_cvcuda:
+            input_type = _import_cvcuda().Tensor
         check_functional_kernel_signature_match(F.vertical_flip, kernel=kernel, input_type=input_type)
 
     @pytest.mark.parametrize(
@@ -1887,6 +1924,10 @@ def test_functional_signature(self, kernel, input_type):
             make_image_tensor,
             make_image_pil,
             make_image,
+            pytest.param(
+                make_image_cvcuda,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"),
+            ),
             make_bounding_boxes,
             make_segmentation_mask,
             make_video,
@@ -1898,13 +1939,23 @@ def test_transform(self, make_input, device):
         check_transform(transforms.RandomVerticalFlip(p=1), make_input(device=device))
 
     @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)])
-    def test_image_correctness(self, fn):
-        image = make_image(dtype=torch.uint8, device="cpu")
-
+    @pytest.mark.parametrize(
+        "make_input",
+        [
+            make_image,
+            pytest.param(
+                make_image_cvcuda,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"),
+            ),
+        ],
+    )
+    def test_image_correctness(self, fn, make_input):
+        image = make_input()
         actual = fn(image)
-        expected = F.to_image(F.vertical_flip(F.to_pil_image(image)))
-
-        torch.testing.assert_close(actual, expected)
+        if make_input is make_image_cvcuda:
+            image = F.cvcuda_to_tensor(image)[0].cpu()
+        expected = F.vertical_flip(F.to_pil_image(image))
+        assert_equal(actual, expected)
 
     def _reference_vertical_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes):
         affine_matrix = np.array(
@@ -1956,6 +2007,10 @@ def test_keypoints_correctness(self, fn):
             make_image_tensor,
             make_image_pil,
             make_image,
+            pytest.param(
+                make_image_cvcuda,
+                marks=pytest.mark.skipif(not CVCUDA_AVAILABLE, reason="CVCUDA is not available"),
+            ),
             make_bounding_boxes,
             make_segmentation_mask,
             make_video,
@@ -1965,11 +2020,8 @@ def test_keypoints_correctness(self, fn):
     @pytest.mark.parametrize("device", cpu_and_cuda())
     def test_transform_noop(self, make_input, device):
         input = make_input(device=device)
-
         transform = transforms.RandomVerticalFlip(p=0)
-
         output = transform(input)
-
         assert_equal(output, input)
 
 
@@ -6826,7 +6878,7 @@ def test_functional_and_transform(self, dtype, device, color_space, batch_dims,
         assert F.get_size(output) == F.get_size(input_tensor)
 
     def test_functional_error(self):
-        with pytest.raises(TypeError, match="cvcuda_img should be `cvcuda.Tensor`"):
+        with pytest.raises(TypeError, match=r"cvcuda_img should be ``cvcuda\.Tensor``\. Got .+\."):
             F.cvcuda_to_tensor(object())
 
 
 
@@ -11,7 +11,7 @@
 from torchvision.ops.boxes import box_iou
 from torchvision.transforms.functional import _get_perspective_coeffs
 from torchvision.transforms.v2 import functional as F, InterpolationMode, Transform
-from torchvision.transforms.v2.functional._utils import _FillType
+from torchvision.transforms.v2.functional._utils import _FillType, _is_cvcuda_available, _is_cvcuda_tensor
 
 from ._transform import _RandomApplyTransform
 from ._utils import (
@@ -30,6 +30,8 @@
     query_size,
 )
 
+CVCUDA_AVAILABLE = _is_cvcuda_available()
+
 
 class RandomHorizontalFlip(_RandomApplyTransform):
     """Horizontally flip the input with a given probability.
@@ -45,6 +47,9 @@ class RandomHorizontalFlip(_RandomApplyTransform):
 
     _v1_transform_cls = _transforms.RandomHorizontalFlip
 
+    if CVCUDA_AVAILABLE:
+        _transformed_types = _RandomApplyTransform._transformed_types + (_is_cvcuda_tensor,)
+
     def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
         return self._call_kernel(F.horizontal_flip, inpt)
 
@@ -63,6 +68,9 @@ class RandomVerticalFlip(_RandomApplyTransform):
 
     _v1_transform_cls = _transforms.RandomVerticalFlip
 
+    if CVCUDA_AVAILABLE:
+        _transformed_types = _RandomApplyTransform._transformed_types + (_is_cvcuda_tensor,)
+
     def transform(self, inpt: Any, params: dict[str, Any]) -> Any:
         return self._call_kernel(F.vertical_flip, inpt)