diff --git a/cuda_core/cuda/core/experimental/_layout.pxd b/cuda_core/cuda/core/experimental/_layout.pxd new file mode 100644 index 0000000000..fe0e8d2e58 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_layout.pxd @@ -0,0 +1,685 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython +from cython.operator cimport dereference as deref + +from libc.stdint cimport int64_t, int32_t, uint32_t, uint64_t, uintptr_t +from libcpp cimport vector + +ctypedef int64_t extent_t +ctypedef int64_t stride_t +ctypedef int32_t axis_t + +ctypedef uint64_t axes_mask_t # MUST be exactly STRIDED_LAYOUT_MAX_NDIM bits wide +ctypedef uint32_t property_mask_t + +ctypedef vector.vector[stride_t] extents_strides_t +ctypedef vector.vector[axis_t] axis_vec_t + +from cuda.core.experimental._utils cimport cuda_utils + + +ctypedef fused integer_t: + int64_t + int32_t + + +cdef extern from "include/layout.hpp": + + cdef int STRIDED_LAYOUT_MAX_NDIM + cdef axes_mask_t AXIS_MASK_ALL + int64_t _c_abs(int64_t x) nogil + void _order_from_strides(axis_vec_t& indices, extent_t* shape, stride_t* strides, int ndim) except + nogil + void _swap(extents_strides_t &a, extents_strides_t &b) noexcept nogil + void _swap(int64_t* a, int64_t* b) noexcept nogil + void _swap(int a, int b) noexcept nogil + void _swap(axis_vec_t &a, axis_vec_t &b) noexcept nogil + + +cdef enum OrderFlag: + ORDER_NONE = 0 + ORDER_C = 1 + ORDER_F = 2 + ORDER_PERM = 3 + + +cdef enum Property: + PROP_IS_UNIQUE = 1 << 0 + PROP_IS_CONTIGUOUS_C = 1 << 1 + PROP_IS_CONTIGUOUS_F = 1 << 2 + PROP_IS_CONTIGUOUS_ANY = 1 << 3 + PROP_IS_DENSE = 1 << 4 + PROP_OFFSET_BOUNDS = 1 << 5 + PROP_REQUIRED_SIZE_IN_BYTES = 1 << 6 + PROP_SHAPE = 1 << 7 + PROP_STRIDES = 1 << 8 + PROP_STRIDES_IN_BYTES = 1 << 9 + PROP_STRIDE_ORDER = 1 << 10 + PROP_VOLUME = 1 << 11 + + +cdef struct BaseLayout: + # A struct holding the shape and strides for the layout. + # Use ``init_base_layout`` to initialize the layout; it sets + # the ``shape`` and ``strides`` pointers to point to + # ndim contiguous integer arrays. + # The ``shape`` pointer must not be NULL; the ``strides`` can be + # set to NULL by the user to indicate a C-contiguous layout. + # Uses a single _mem allocation to reduce overhead + # (allocation and exception checks). + + extents_strides_t _mem + extent_t* shape + stride_t* strides + int ndim + + +@cython.final +cdef class StridedLayout: + + # Definition + cdef: + BaseLayout base + + readonly: + int itemsize + stride_t slice_offset + + # Lazy properties computed from the defining values.
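+ # Each bit of ``_prop_mask`` corresponds to one Property flag and marks + # the matching memoized value below as valid; operations that change a + # defining value reset the whole mask to 0.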
+ cdef: + # Set to 0 to invalidate all properties, + # whenever a defining value is changed + property_mask_t _prop_mask + + # C and Python properties + property_mask_t _boolean_props + int64_t _required_size_in_bytes + stride_t _min_offset + stride_t _max_offset + int64_t _volume + + # Python properties + tuple _py_shape + tuple _py_strides + tuple _py_strides_in_bytes + tuple _py_stride_order + + # ============================== + # Initialization + # ============================== + + cdef inline int _init(StridedLayout self, BaseLayout& base, int itemsize, bint divide_strides=False) except -1 nogil: + _validate_itemsize(itemsize) + + if base.strides != NULL and divide_strides: + _divide_strides(base, itemsize) + + self.itemsize = itemsize + self.slice_offset = 0 + + _swap_layout(self.base, base) + return 0 + + cdef inline stride_t _init_dense(StridedLayout self, BaseLayout& base, int itemsize, OrderFlag order_flag, axis_vec_t* stride_order=NULL) except -1 nogil: + _validate_itemsize(itemsize) + + cdef stride_t volume + if order_flag == ORDER_C: + volume = _dense_strides_c(base) + elif order_flag == ORDER_F: + volume = _dense_strides_f(base) + elif order_flag == ORDER_PERM: + if stride_order == NULL: + raise ValueError("stride_order is required for ORDER_PERM") + volume = _dense_strides_in_order(base, deref(stride_order)) + else: + raise ValueError("The stride_order must be 'C', 'F', or a permutation.") + + self.itemsize = itemsize + self.slice_offset = 0 + _swap_layout(self.base, base) + self._volume = volume + _mark_property_valid(self, PROP_VOLUME) + return 0 + + cdef inline int init_from_ptr(StridedLayout self, int ndim, extent_t* shape, stride_t* strides, int itemsize, bint divide_strides=False) except -1 nogil: + cdef BaseLayout base + _init_base_layout_from_ptr(base, ndim, shape, strides) + return self._init(base, itemsize, divide_strides) + + cdef inline int init_dense_from_ptr(StridedLayout self, int ndim, extent_t* shape, int itemsize, OrderFlag order_flag, axis_vec_t* stride_order=NULL) except -1 nogil: + cdef BaseLayout base + _init_base_layout_from_ptr(base, ndim, shape, NULL) + return self._init_dense(base, itemsize, order_flag, stride_order) + + cdef inline int init_from_tuple(StridedLayout self, tuple shape, tuple strides, int itemsize, bint divide_strides=False) except -1: + cdef BaseLayout base + _init_base_layout_from_tuple(base, shape, strides) + return self._init(base, itemsize, divide_strides) + + cdef inline int init_dense_from_tuple(StridedLayout self, tuple shape, int itemsize, object stride_order) except -1: + cdef axis_vec_t stride_order_vec + cdef OrderFlag order_flag = _stride_order2vec(stride_order_vec, stride_order) + + if order_flag == ORDER_NONE: + raise ValueError(f"The stride_order must be 'C', 'F', or a permutation tuple. 
Got: {stride_order}") + + cdef BaseLayout base + _init_base_layout_from_tuple(base, shape, None) + return self._init_dense(base, itemsize, order_flag, &stride_order_vec) + + # ============================== + # Properties + # ============================== + + cdef inline tuple get_shape_tuple(StridedLayout self): + if not _has_valid_property(self, PROP_SHAPE): + self._py_shape = cuda_utils.carray_integer_t_to_tuple(self.base.shape, self.base.ndim) + _mark_property_valid(self, PROP_SHAPE) + return self._py_shape + + cdef inline tuple get_strides_tuple(StridedLayout self): + if not _has_valid_property(self, PROP_STRIDES): + if self.base.strides == NULL: + self._py_strides = None + else: + self._py_strides = cuda_utils.carray_integer_t_to_tuple(self.base.strides, self.base.ndim) + _mark_property_valid(self, PROP_STRIDES) + return self._py_strides + + cdef inline int get_strides_in_bytes(StridedLayout self, extents_strides_t& strides) except -1 nogil: + if self.base.strides != NULL: + strides.resize(self.base.ndim) + for i in range(self.base.ndim): + strides[i] = _overflow_checked_mul(self.base.strides[i], self.itemsize) + return 0 + + cdef inline tuple get_strides_in_bytes_tuple(StridedLayout self): + if _has_valid_property(self, PROP_STRIDES_IN_BYTES): + return self._py_strides_in_bytes + cdef extents_strides_t strides + if self.base.strides == NULL: + self._py_strides_in_bytes = None + else: + self.get_strides_in_bytes(strides) + self._py_strides_in_bytes = cuda_utils.carray_integer_t_to_tuple(strides.data(), strides.size()) + _mark_property_valid(self, PROP_STRIDES_IN_BYTES) + return self._py_strides_in_bytes + + cdef inline int64_t get_volume(StridedLayout self) except -1 nogil: + if not _has_valid_property(self, PROP_VOLUME): + self._volume = _volume(self.base) + _mark_property_valid(self, PROP_VOLUME) + return self._volume + + cdef inline int get_stride_order(StridedLayout self, axis_vec_t& stride_order) except -1 nogil: + _order_from_strides(stride_order, self.base.shape, self.base.strides, self.base.ndim) + return 0 + + cdef inline tuple get_stride_order_tuple(StridedLayout self): + if _has_valid_property(self, PROP_STRIDE_ORDER): + return self._py_stride_order + cdef axis_vec_t stride_order + self.get_stride_order(stride_order) + self._py_stride_order = cuda_utils.carray_integer_t_to_tuple(stride_order.data(), stride_order.size()) + _mark_property_valid(self, PROP_STRIDE_ORDER) + return self._py_stride_order + + cdef inline bint get_is_unique(StridedLayout self) except -1 nogil: + if _has_valid_property(self, PROP_IS_UNIQUE): + return _boolean_property(self, PROP_IS_UNIQUE) + if self.base.strides == NULL or self.get_volume() == 0: + return _set_boolean_property(self, PROP_IS_UNIQUE, True) + cdef axis_vec_t stride_order + self.get_stride_order(stride_order) + return _set_boolean_property(self, PROP_IS_UNIQUE, _is_unique(self.base, stride_order)) + + cdef inline bint get_is_contiguous_c(StridedLayout self) except -1 nogil: + if _has_valid_property(self, PROP_IS_CONTIGUOUS_C): + return _boolean_property(self, PROP_IS_CONTIGUOUS_C) + return _set_boolean_property(self, PROP_IS_CONTIGUOUS_C, _is_contiguous_c(self.get_volume(), self.base)) + + cdef inline bint get_is_contiguous_f(StridedLayout self) except -1 nogil: + if _has_valid_property(self, PROP_IS_CONTIGUOUS_F): + return _boolean_property(self, PROP_IS_CONTIGUOUS_F) + return _set_boolean_property(self, PROP_IS_CONTIGUOUS_F, _is_contiguous_f(self.get_volume(), self.base)) + + cdef inline bint get_is_contiguous_any(StridedLayout self) 
except -1 nogil: + if _has_valid_property(self, PROP_IS_CONTIGUOUS_ANY): + return _boolean_property(self, PROP_IS_CONTIGUOUS_ANY) + cdef axis_vec_t stride_order + self.get_stride_order(stride_order) + return _set_boolean_property(self, PROP_IS_CONTIGUOUS_ANY, _is_contiguous_any(self.get_volume(), self.base, stride_order)) + + cdef inline bint get_is_dense(StridedLayout self) except -1 nogil: + if _has_valid_property(self, PROP_IS_DENSE): + return _boolean_property(self, PROP_IS_DENSE) + return _set_boolean_property(self, PROP_IS_DENSE, self.slice_offset == 0 and self.get_is_contiguous_any()) + + cdef inline int get_offset_bounds(StridedLayout self, stride_t& min_offset, stride_t& max_offset) except -1 nogil: + if _has_valid_property(self, PROP_OFFSET_BOUNDS): + min_offset = self._min_offset + max_offset = self._max_offset + return 0 + cdef int ndim = self.base.ndim + cdef stride_t stride + cdef extent_t extent + min_offset = self.slice_offset + max_offset = self.slice_offset + if self.base.strides == NULL: + max_offset = _overflow_checked_sum(max_offset, self.get_volume() - 1) + else: + for i in range(ndim): + stride = self.base.strides[i] # can be negative + extent = self.base.shape[i] # must be non-negative + if extent == 0: + min_offset = 0 + max_offset = -1 # empty range + return 0 + if stride <= 0: + min_offset = _overflow_checked_sum(min_offset, _overflow_checked_mul(stride, (extent - 1))) + else: + max_offset = _overflow_checked_sum(max_offset, _overflow_checked_mul(stride, (extent - 1))) + self._min_offset = min_offset + self._max_offset = max_offset + _mark_property_valid(self, PROP_OFFSET_BOUNDS) + return 0 + + cdef inline int64_t get_required_size_in_bytes(StridedLayout self) except? -1 nogil: + if _has_valid_property(self, PROP_REQUIRED_SIZE_IN_BYTES): + return self._required_size_in_bytes + cdef stride_t min_offset = 0 + cdef stride_t max_offset = 0 + self.get_offset_bounds(min_offset, max_offset) + if min_offset < 0: + raise ValueError( + f"Allocation size for a layout that maps elements " + f"to negative memory offsets is ambiguous. " + f"The layout's min_offset is {min_offset}. " + f"To create a supported layout with the same shape " + f"please use StridedLayout.to_dense()." + ) + if max_offset < min_offset: + return 0 + cdef int64_t required_size_in_bytes = _overflow_checked_sum(max_offset, 1) + self._required_size_in_bytes = _overflow_checked_mul(required_size_in_bytes, self.itemsize) + _mark_property_valid(self, PROP_REQUIRED_SIZE_IN_BYTES) + return self._required_size_in_bytes + + cdef inline int64_t get_slice_offset_in_bytes(StridedLayout self) except? -1 nogil: + return _overflow_checked_mul(self.slice_offset, self.itemsize) + + cdef axes_mask_t get_flattened_axis_mask(StridedLayout self) except? 
-1 nogil + cdef int get_max_compatible_itemsize(StridedLayout self, int max_itemsize, uintptr_t data_ptr, int axis=*) except -1 nogil + + # ============================== + # Layout manipulation + # ============================== + + + cdef int reshape_into(StridedLayout self, StridedLayout out_layout, BaseLayout& new_shape) except -1 nogil + cdef int permute_into(StridedLayout self, StridedLayout out_layout, axis_vec_t& axis_order) except -1 nogil + + cdef int flatten_into(StridedLayout self, StridedLayout out_layout, axes_mask_t axis_mask=*) except -1 nogil + cdef int squeeze_into(StridedLayout self, StridedLayout out_layout) except -1 nogil + cdef int unsqueeze_into(StridedLayout self, StridedLayout out_layout, axis_vec_t& axis_vec) except -1 nogil + cdef int broadcast_into(StridedLayout self, StridedLayout out_layout, BaseLayout& broadcast) except -1 nogil + cdef int pack_into(StridedLayout self, StridedLayout out_layout, int itemsize, uintptr_t data_ptr, bint keep_dim, int axis=*) except -1 nogil + cdef int unpack_into(StridedLayout self, StridedLayout out_layout, int itemsize, int axis=*) except -1 nogil + cdef int slice_into(StridedLayout self, StridedLayout out_layout, tuple slices) except -1 + +# ============================== +# Base layout helpers +# ============================== + + +cdef inline int init_base_layout(BaseLayout& layout, int ndim) except -1 nogil: + if ndim > STRIDED_LAYOUT_MAX_NDIM: + raise ValueError(f"Unsupported number of dimensions: {ndim}. Max supported ndim is {STRIDED_LAYOUT_MAX_NDIM}") + # resize(0) would be a no-op leaving _mem.data() NULL, + # which would make it tricky to distinguish between strides == NULL + # and strides == tuple(); hence the max(ndim, 1) below. + layout._mem.resize(2 * max(ndim, 1)) + layout.shape = layout._mem.data() + layout.strides = layout._mem.data() + ndim + layout.ndim = ndim + return 0 + + +cdef inline int trim_base_layout(BaseLayout& layout, int ndim) except -1 nogil: + if ndim > layout.ndim: + raise AssertionError(f"Cannot trim layout to {ndim} dimensions: it has only {layout.ndim} dimensions") + layout.ndim = ndim + return 0 + + +cdef inline void _swap_layout(BaseLayout& a, BaseLayout& b) noexcept nogil: + _swap(a._mem, b._mem) + _swap(a.shape, b.shape) + _swap(a.strides, b.strides) + _swap(a.ndim, b.ndim) + + +cdef inline void _assure_strides_ptr(BaseLayout& base) noexcept nogil: + if base.strides == NULL: + base.strides = base._mem.data() + base._mem.size() // 2 + + +cdef inline stride_t *get_strides_ptr(BaseLayout& base) except? NULL nogil: + if base.strides != NULL: + return base.strides + cdef stride_t* tmp_strides = base._mem.data() + base._mem.size() // 2 + _dense_strides_c_ptrs(base.ndim, base.shape, tmp_strides) + return tmp_strides + + +cdef inline bint _base_layout_equal(BaseLayout& a, BaseLayout& b) noexcept nogil: + if a.ndim != b.ndim: + return False + for i in range(a.ndim): + if a.shape[i] != b.shape[i]: + return False + if a.strides != NULL or b.strides != NULL: + if a.strides == NULL or b.strides == NULL: + return False + for i in range(a.ndim): + if a.strides[i] != b.strides[i]: + return False + return True + + +@cython.overflowcheck(True) +cdef inline int64_t _volume(BaseLayout& base) except?
-1 nogil: + cdef int64_t vol = 1 + for i in range(base.ndim): + vol *= base.shape[i] + return vol + + +cdef inline int _divide_strides(BaseLayout& base, int itemsize) except -1 nogil: + cdef stride_t stride + if base.strides == NULL: + raise ValueError("cannot divide strides, layout has no strides") + for i in range(base.ndim): + stride = base.strides[i] // itemsize + if stride * itemsize != base.strides[i]: + raise ValueError("strides must be divisible by itemsize") + base.strides[i] = stride + return 0 + + +cdef inline void _zero_strides_ptr(int ndim, stride_t* strides) noexcept nogil: + for i in range(ndim): + strides[i] = 0 + + +cdef inline void _zero_strides(BaseLayout& base) noexcept nogil: + _assure_strides_ptr(base) + _zero_strides_ptr(base.ndim, base.strides) + + +cdef inline stride_t _dense_strides_c_ptrs(int ndim, extent_t* shape, stride_t* strides) except? -1 nogil: + cdef stride_t stride = 1 + cdef int i = ndim - 1 + while i >= 0: + strides[i] = stride + stride = _overflow_checked_mul(stride, shape[i]) + i -= 1 + if stride == 0: + _zero_strides_ptr(ndim, strides) + return stride + + +cdef inline stride_t _dense_strides_c(BaseLayout& base) except? -1 nogil: + cdef int ndim = base.ndim + _assure_strides_ptr(base) + return _dense_strides_c_ptrs(ndim, base.shape, base.strides) + + +cdef inline stride_t _dense_strides_f(BaseLayout& base) except? -1 nogil: + cdef int ndim = base.ndim + _assure_strides_ptr(base) + cdef stride_t stride = 1 + cdef int i = 0 + while i < ndim: + base.strides[i] = stride + stride = _overflow_checked_mul(stride, base.shape[i]) + i += 1 + if stride == 0: + _zero_strides(base) + return stride + + +cdef inline stride_t _dense_strides_in_order(BaseLayout& base, axis_vec_t& stride_order) except? -1 nogil: + cdef int ndim = base.ndim + if ndim != stride_order.size(): + raise ValueError(f"stride_order must have the same length as shape. Shape has {ndim} dimensions, but stride_order has {stride_order.size()} elements.") + _assure_strides_ptr(base) + cdef stride_t stride = 1 + cdef int i = ndim - 1 + cdef axes_mask_t axis_order_mask = 0 + cdef axes_mask_t axis_mask + cdef axis_t axis + while i >= 0: + axis = stride_order[i] + if not _normalize_axis(axis, ndim): + raise ValueError(f"Invalid stride order: axis {axis} out of range for {ndim}D tensor") + axis_mask = _axis2mask(axis) + if axis_order_mask & axis_mask: + raise ValueError(f"The stride order must be a permutation. 
Axis {axis} appears multiple times.") + axis_order_mask |= axis_mask + base.strides[axis] = stride + stride = _overflow_checked_mul(stride, base.shape[axis]) + i -= 1 + if stride == 0: + _zero_strides(base) + return stride + + +cdef inline bint _is_contiguous_c(int64_t volume, BaseLayout& base) except -1 nogil: + if volume == 0 or base.strides == NULL: + return True + cdef int64_t stride = 1 + cdef int64_t j = base.ndim - 1 + cdef extent_t extent + while j >= 0: + extent = base.shape[j] + if extent != 1: + if base.strides[j] != stride: + return False + stride *= extent + j -= 1 + return True + + +cdef inline bint _is_contiguous_f(int64_t volume, BaseLayout& base) except -1 nogil: + if volume == 0: + return True + cdef int ndim = base.ndim + cdef int64_t j = 0 + if base.strides == NULL: + # find first non-singleton dimension + while j < ndim and base.shape[j] == 1: + j += 1 + # if any subsequent dimension is not a singleton, return False + for i in range(j + 1, ndim): + if base.shape[i] != 1: + return False + return True + cdef int64_t stride = 1 + cdef extent_t extent + while j < ndim: + extent = base.shape[j] + if extent != 1: + if base.strides[j] != stride: + return False + stride *= extent + j += 1 + return True + + +cdef inline bint _is_contiguous_any(int64_t volume, BaseLayout& base, axis_vec_t& axis_order) except -1 nogil: + if volume == 0 or base.strides == NULL: + return True + cdef int64_t stride = 1 + cdef int64_t j = base.ndim - 1 + cdef axis_t axis + cdef extent_t extent + while j >= 0: + axis = axis_order[j] + extent = base.shape[axis] + if extent != 1: + if base.strides[axis] != stride: + return False + stride *= extent + j -= 1 + return True + + +cdef inline int _validate_shape(BaseLayout& base) except -1 nogil: + for i in range(base.ndim): + if base.shape[i] < 0: + raise ValueError("Extents must be non-negative") + return 0 + + +cdef inline int _init_base_layout_from_tuple(BaseLayout& base, tuple shape, tuple strides) except -1: + cdef int ndim = len(shape) + init_base_layout(base, ndim) + for i in range(ndim): + base.shape[i] = shape[i] + _validate_shape(base) + + if strides is None: + base.strides = NULL + else: + if len(strides) != ndim: + raise ValueError(f"Strides, if provided, must have the same length as shape. 
Shape has {ndim} dimensions, but strides has {len(strides)} elements.") + for i in range(ndim): + base.strides[i] = strides[i] + return 0 + + +cdef inline int _init_base_layout_from_ptr(BaseLayout& base, int ndim, extent_t* shape, stride_t* strides) except -1 nogil: + init_base_layout(base, ndim) + for i in range(ndim): + base.shape[i] = shape[i] + _validate_shape(base) + + if strides == NULL: + base.strides = NULL + else: + for i in range(ndim): + base.strides[i] = strides[i] + return 0 + +# ============================== +# Strided layout helpers +# ============================== + + +cdef inline bint _has_valid_property(StridedLayout self, Property prop) noexcept nogil: + return self._prop_mask & prop + + +cdef inline void _mark_property_valid(StridedLayout self, Property prop) noexcept nogil: + self._prop_mask |= prop + + +cdef inline bint _boolean_property(StridedLayout self, Property prop) noexcept nogil: + return self._boolean_props & prop + + +cdef inline bint _set_boolean_property(StridedLayout self, Property prop, bint value) noexcept nogil: + if value: + self._boolean_props |= prop + else: + self._boolean_props &= ~prop + _mark_property_valid(self, prop) + return value + + +# ============================== +# Conversion, validation and normalization helpers +# ============================== + + +cdef inline axes_mask_t _axis2mask(axis_t axis) noexcept nogil: + return 1ULL << axis + + +cdef inline OrderFlag _stride_order2vec(axis_vec_t& stride_order_vec, object stride_order) except? ORDER_NONE: + if stride_order == 'C': + return ORDER_C + elif stride_order == 'F': + return ORDER_F + elif isinstance(stride_order, tuple | list): + _tuple2axis_vec(stride_order_vec, stride_order) + return ORDER_PERM + return ORDER_NONE + + +cdef inline int _tuple2axis_vec(axis_vec_t& vec, object t) except -1: + cdef int ndim = len(t) + vec.resize(ndim) + for i in range(ndim): + vec[i] = t[i] + return 0 + + +cdef inline bint _normalize_axis(integer_t& axis, integer_t extent) except -1 nogil: + if axis < -extent or axis >= extent: + return False + if axis < 0: + axis += extent + return True + + +cdef inline int _validate_itemsize(int itemsize) except -1 nogil: + if itemsize <= 0: + raise ValueError("itemsize must be positive") + if itemsize & (itemsize - 1): + raise ValueError("itemsize must be a power of two") + return 0 + + +cdef inline bint _is_unique(BaseLayout& base, axis_vec_t& stride_order) except -1 nogil: + if base.strides == NULL: + return True + cdef int64_t cur_max_offset = 0 + cdef int i = base.ndim - 1 + cdef int64_t stride + cdef axis_t axis + cdef extent_t extent + while i >= 0: + axis = stride_order[i] + extent = base.shape[axis] + if extent != 1: + stride = _c_abs(base.strides[axis]) + if cur_max_offset >= stride: + return False + cur_max_offset = _overflow_checked_sum(cur_max_offset, _overflow_checked_mul(stride, (extent - 1))) + i -= 1 + return True + + +@cython.overflowcheck(True) +cdef inline int64_t _overflow_checked_mul(int64_t a, int64_t b) except? -1 nogil: + return a * b + + +@cython.overflowcheck(True) +cdef inline int64_t _overflow_checked_diff(int64_t a, int64_t b) except? -1 nogil: + return a - b + + +@cython.overflowcheck(True) +cdef inline int64_t _overflow_checked_sum(int64_t a, int64_t b) except? -1 nogil: + return a + b + + +@cython.overflowcheck(True) +cdef inline int64_t _overflow_checked_div_ceil(int64_t a, int64_t b) except? 
-1 nogil: + return (a + b - 1) // b diff --git a/cuda_core/cuda/core/experimental/_layout.pyx b/cuda_core/cuda/core/experimental/_layout.pyx new file mode 100644 index 0000000000..f1aa1ec78b --- /dev/null +++ b/cuda_core/cuda/core/experimental/_layout.pyx @@ -0,0 +1,1323 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython + +from libc.stdint cimport int64_t, uintptr_t + +from cpython.object cimport PyObject + + +cdef extern from "Python.h": + int _PySlice_Unpack "PySlice_Unpack" (PyObject *slice, Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step) except -1 + Py_ssize_t _PySlice_AdjustIndices "PySlice_AdjustIndices" (Py_ssize_t length, Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t step) noexcept nogil + + +@cython.final +cdef class StridedLayout: + """ + A class describing the layout of a multi-dimensional tensor + with a shape, strides and itemsize. + + Parameters + ---------- + shape : tuple + A tuple of non-negative integers. + strides : tuple, optional + If provided, must be a tuple of integers of the same length as ``shape``. + Otherwise, the strides are assumed to be implicitly C-contiguous and the resulting + layout's :attr:`strides` will be None. + itemsize : int + The number of bytes per single element (dtype size). Must be a power of two. + divide_strides : bool, optional + If True, the provided :attr:`strides` will be divided by the :attr:`itemsize`. + + + See also :meth:`dense`. + + + Attributes + ---------- + itemsize : int + The number of bytes per single element (dtype size). Must be a power of two. + slice_offset : int + The offset (as a number of elements, not bytes) of the element at + index ``(0,) * ndim``. See also :attr:`slice_offset_in_bytes`. + """ + + def __init__( + self : StridedLayout, + shape : tuple[int], + strides : tuple[int] | None, + itemsize : int, + divide_strides : bool = False + ) -> None: + self.init_from_tuple(shape, strides, itemsize, divide_strides) + + @classmethod + def dense( + cls, + shape : tuple[int], + itemsize : int, + stride_order : str | tuple[int] = 'C' + ) -> StridedLayout: + """ + Creates a new StridedLayout instance with dense strides. + + Parameters + ---------- + shape : tuple + A tuple of non-negative integers. + itemsize : int + The number of bytes per single element of the tensor. + stride_order : str or tuple, optional + The order of the strides: + * 'C' (default) - the strides are computed in C-order (increasing from the right to the left) + * 'F' - the strides are computed in F-order (increasing from the left to the right) + * A tuple - it must be a permutation of ``tuple(range(len(shape)))``. + The last element of the tuple is the axis with stride 1. + + See also :attr:`stride_order`. + + + .. highlight:: python + .. 
code-block:: python + + assert StridedLayout.dense((5, 3, 7), 1, "C") == StridedLayout((5, 3, 7), (21, 7, 1), 1) + assert StridedLayout.dense((5, 3, 7), 1, "F") == StridedLayout((5, 3, 7), (1, 5, 15), 1) + assert StridedLayout.dense((5, 3, 7), 1, (2, 0, 1)) == StridedLayout((5, 3, 7), (3, 1, 15), 1) + + """ + cdef StridedLayout new_layout = StridedLayout.__new__(cls) + new_layout.init_dense_from_tuple(shape, itemsize, stride_order) + return new_layout + + @classmethod + def dense_like( + cls, + other : StridedLayout, + stride_order : str | tuple[int] = "K" + ) -> StridedLayout: + """ + Creates a StridedLayout with the same :attr:`shape` and :attr:`itemsize` as the other layout, + but with contiguous strides in the specified order and no slice offset. + + See also :attr:`is_dense`. + + Parameters + ---------- + other : StridedLayout + The StridedLayout to copy the :attr:`shape` and :attr:`itemsize` from. + stride_order : str or tuple, optional + The order of the strides: + * 'K' (default) - keeps the order of the strides as in the ``other`` layout. + * 'C' - the strides are computed in C-order (increasing from the right to the left) + * 'F' - the strides are computed in F-order (increasing from the left to the right) + * A tuple - it must be a permutation of ``tuple(range(len(shape)))``. + The last element of the tuple is the axis with stride 1. + + See also :attr:`stride_order`. + + + .. highlight:: python + .. code-block:: python + + layout = StridedLayout.dense((5, 3, 7), 1).permuted((2, 0, 1)) + assert layout == StridedLayout((7, 5, 3), (1, 21, 7), 1) + + # dense_like with the default "K" stride_order + # keeps the same order of strides as in the original layout + assert StridedLayout.dense_like(layout) == layout + # "C", "F" recompute the strides accordingly + assert StridedLayout.dense_like(layout, "C") == StridedLayout((7, 5, 3), (15, 3, 1), 1) + assert StridedLayout.dense_like(layout, "F") == StridedLayout((7, 5, 3), (1, 7, 35), 1) + """ + cdef OrderFlag order_flag + cdef axis_vec_t stride_order_vec + cdef bint is_dense = other.get_is_dense() + + if stride_order == "K": + if is_dense: + return other + other.get_stride_order(stride_order_vec) + order_flag = ORDER_PERM + else: + order_flag = _stride_order2vec(stride_order_vec, stride_order) + if order_flag == ORDER_NONE: + raise ValueError( + f"The stride_order must be 'K', 'C', 'F', " + f"or a permutation tuple. Got: {stride_order}" + ) + elif order_flag == ORDER_C: + if is_dense and other.get_is_contiguous_c(): + return other + elif order_flag == ORDER_F: + if is_dense and other.get_is_contiguous_f(): + return other + + cdef StridedLayout new_layout = StridedLayout.__new__(cls) + new_layout.init_dense_from_ptr( + other.base.ndim, + other.base.shape, + other.itemsize, + order_flag, + &stride_order_vec + ) + return new_layout + + def __repr__(self : StridedLayout) -> str: + if self.slice_offset == 0: + return ( + f"StridedLayout(shape={self.shape}, strides={self.strides}, itemsize={self.itemsize})" + ) + else: + return ( + f"StridedLayout(shape={self.shape}, strides={self.strides}, itemsize={self.itemsize}, _slice_offset={self.slice_offset})" + ) + + def __eq__(self : StridedLayout, other) -> bool: + if not isinstance(other, StridedLayout): + return False + cdef StridedLayout other_layout = <StridedLayout>other + return ( + self.itemsize == other_layout.itemsize + and self.slice_offset == other_layout.slice_offset + and _base_layout_equal(self.base, other_layout.base) + ) + + @property + def ndim(self : StridedLayout): + """ + The number of dimensions (length of the shape tuple).
+ + :type: int + """ + return self.base.ndim + + @property + def shape(self : StridedLayout): + """ + Shape of the tensor. + + :type: tuple[int] + """ + return self.get_shape_tuple() + + @property + def strides(self : StridedLayout): + """ + Strides of the tensor (in **counts**, not bytes). + If StridedLayout was created with strides=None, the + returned value is None and layout is implicitly C-contiguous. + + :type: tuple[int] | None + """ + return self.get_strides_tuple() + + @property + def strides_in_bytes(self : StridedLayout): + """ + Strides of the tensor (in bytes). + + :type: tuple[int] | None + """ + return self.get_strides_in_bytes_tuple() + + @property + def stride_order(self : StridedLayout): + """ + A permutation of ``tuple(range(ndim))`` describing the + relative order of the strides. + + .. highlight:: python + .. code-block:: python + + # C-contiguous layout + assert StridedLayout.dense((5, 3, 7), 1).stride_order == (0, 1, 2) + # F-contiguous layout + assert StridedLayout.dense((5, 3, 7), 1, stride_order="F").stride_order == (2, 1, 0) + # Permuted layout + assert StridedLayout.dense((5, 3, 7), 1, stride_order=(2, 0, 1)).stride_order == (2, 0, 1) + + :type: tuple[int] + """ + return self.get_stride_order_tuple() + + @property + def volume(self : StridedLayout): + """ + The number of elements in the tensor, i.e. the product of the shape tuple. + + :type: int + """ + return self.get_volume() + + @property + def is_unique(self : StridedLayout): + """ + If True, each element of a tensor with this layout is mapped to + a unique memory offset. + + All contiguous layouts are unique and so are layouts that can be created + by permuting, slicing, flattening, squeezing, repacking, or reshaping + a contiguous layout. + Conversely, broadcast layouts (layouts with a 0 stride + for some extent greater than 1) are not unique. + + For layouts resulting from manual stride manipulations + (such as with ``numpy.lib.stride_tricks``), the check + may inaccurately report False, as the exact uniqueness + check may be expensive. + + :type: bool + """ + return self.get_is_unique() + + @property + def is_contiguous_c(self : StridedLayout): + """ + True iff the layout is contiguous in C-order, i.e. + the rightmost stride is 1 and each subsequent + stride to the left is the product of the + extent and the stride to the right. + + .. highlight:: python + .. code-block:: python + + layout = StridedLayout.dense((2, 5, 3), 1, "C") + assert layout == StridedLayout((2, 5, 3), (15, 3, 1), 1) + assert layout.is_contiguous_c + + See also :attr:`is_contiguous_any`. + + :type: bool + """ + return self.get_is_contiguous_c() + + @property + def is_contiguous_f(self : StridedLayout): + """ + True iff the layout is contiguous in F-order, i.e. + the leftmost stride is 1 and each subsequent + stride to the right is the product of the + stride and extent to the left. + + .. highlight:: python + .. code-block:: python + + layout = StridedLayout.dense((2, 5, 3), 1, "F") + assert layout == StridedLayout((2, 5, 3), (1, 2, 10), 1) + assert layout.is_contiguous_f + + See also :attr:`is_contiguous_any`. + + :type: bool + """ + return self.get_is_contiguous_f() + + @property + def is_contiguous_any(self : StridedLayout): + """ + True iff the layout is contiguous in some axis order, i.e. + there exists a permutation of axes such that the layout + is C-contiguous. + + In a contiguous layout, the strides are non-negative and + the mapping of elements to the memory offset range + ``[min_offset, max_offset]`` is 1-to-1. + + .. 
highlight:: python + .. code-block:: python + + # dense defaults to C-contiguous + layout = StridedLayout.dense((5, 3, 7), 1) + assert layout.is_contiguous_c and not layout.is_contiguous_f + assert layout.is_contiguous_any + + # reversing the order of axes gives F-contiguous layout + permuted = layout.permuted((2, 1, 0)) + assert not permuted.is_contiguous_c and permuted.is_contiguous_f + assert permuted.is_contiguous_any + + # neither C- nor F-order but still contiguous + permuted = layout.permuted((2, 0, 1)) + assert not permuted.is_contiguous_c and not permuted.is_contiguous_f + assert permuted.is_contiguous_any + + # slicing the right-most extent creates a gap in the + # offset_bounds range that is not reachable with any + # element in the sliced layout + sliced = layout[:, :, :-1] + assert not sliced.is_contiguous_c and not sliced.is_contiguous_f + assert not sliced.is_contiguous_any + + :type: bool + """ + return self.get_is_contiguous_any() + + @property + def is_dense(self : StridedLayout): + """ + A dense layout is contiguous (:attr:`is_contiguous_any` is True) + and has no slice offset (:attr:`slice_offset_in_bytes` is 0). + + In a dense layout, elements are mapped 1-to-1 to the ``[0, volume - 1]`` + memory offset range. + + :type: bool + """ + return self.get_is_dense() + + @property + def offset_bounds(self : StridedLayout): + """ + The memory offset range ``[min_offset, max_offset]`` (in element counts, not bytes) + that elements of a tensor with this layout are mapped to. + + If the layout is empty (i.e. ``volume == 0``), the returned tuple is ``(0, -1)``. + Otherwise, ``min_offset <= max_offset`` and all elements of the tensor with + this layout are mapped within the ``[min_offset, max_offset]`` range. + + .. highlight:: python + .. code-block:: python + + # Possible implementation of the offset_bounds + def offset_bounds(layout : StridedLayout): + if layout.volume == 0: + return 0, -1 + ndim = layout.ndim + shape = layout.shape + strides = layout.strides + idx_min = [shape[i] - 1 if strides[i] < 0 else 0 for i in range(ndim)] + idx_max = [shape[i] - 1 if strides[i] > 0 else 0 for i in range(ndim)] + min_offset = sum(strides[i] * idx_min[i] for i in range(ndim)) + layout.slice_offset + max_offset = sum(strides[i] * idx_max[i] for i in range(ndim)) + layout.slice_offset + return min_offset, max_offset + + :type: tuple[int, int] + """ + cdef stride_t min_offset = 0 + cdef stride_t max_offset = 0 + self.get_offset_bounds(min_offset, max_offset) + return min_offset, max_offset + + @property + def min_offset(self : StridedLayout): + """ + See :attr:`offset_bounds` for details. + + :type: int + """ + cdef stride_t min_offset = 0 + cdef stride_t max_offset = 0 + self.get_offset_bounds(min_offset, max_offset) + return min_offset + + @property + def max_offset(self : StridedLayout): + """ + See :attr:`offset_bounds` for details. + + :type: int + """ + cdef stride_t min_offset = 0 + cdef stride_t max_offset = 0 + self.get_offset_bounds(min_offset, max_offset) + return max_offset + + @property + def slice_offset_in_bytes(self : StridedLayout): + """ + The memory offset (as a number of bytes) of the element at index ``(0,) * ndim``. + Equal to :attr:`itemsize` ``*`` :attr:`slice_offset`. + + .. note:: + The only way for the index ``(0,) * ndim`` to be mapped to a non-zero offset + is slicing with :meth:`sliced` method (or ``[]`` operator). 
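+ + For illustration, a small worked example (the values follow from the slicing semantics of :meth:`sliced`): + + .. highlight:: python + .. code-block:: python + + layout = StridedLayout.dense((4, 6), 2) # C-contiguous, strides == (6, 1) + sliced = layout[1:, 2:] + assert sliced.slice_offset == 8 # == 1 * 6 + 2 * 1 elements + assert sliced.slice_offset_in_bytes == 16 # == 8 * itemsize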
+ + :type: int + """ + return self.get_slice_offset_in_bytes() + + def required_size_in_bytes(self : StridedLayout) -> int: + """ + The memory allocation size (in bytes) needed so that + all elements of a tensor with this layout can be mapped + within the allocated memory range. + + The function raises an error if ``min_offset < 0``. + Otherwise, the returned value is equal to + ``(max_offset + 1) * itemsize``. + + .. hint:: + For dense layouts, the function always succeeds and the + ``(max_offset + 1) * itemsize`` is equal to the ``volume * itemsize``. + + .. highlight:: python + .. code-block:: python + + # Allocating memory on a device to copy a host tensor + def device_tensor_like(a : numpy.ndarray, device : ccx.Device) -> StridedMemoryView: + a_view = StridedMemoryView(a, -1) + # get the original layout of ``a`` and convert it to a dense layout + # to avoid overallocating memory (e.g. if the ``a`` was sliced) + layout = a_view.layout.to_dense() + # get the required size in bytes to fit the tensor + required_size = layout.required_size_in_bytes() + # allocate the memory on the device + device.set_current() + mem = device.allocate(required_size) + # create a view on the newly allocated device memory + b_view = StridedMemoryView.from_buffer(mem, layout, a_view.dtype) + return b_view + """ + return self.get_required_size_in_bytes() + + def flattened_axis_mask(self : StridedLayout) -> axes_mask_t: + """ + A mask describing which axes of this layout are mergeable + using the :meth:`flattened` method. + """ + return self.get_flattened_axis_mask() + + def to_dense(self : StridedLayout, object stride_order="K") -> StridedLayout: + """ + Returns a dense layout with the same shape and itemsize, + but with dense strides in the specified order. + + See :meth:`dense_like` method documentation for details. + """ + return StridedLayout.dense_like(self, stride_order) + + def reshaped(self : StridedLayout, shape : tuple[int]) -> StridedLayout: + """ + Returns a layout with the new shape, if the new shape is compatible + with the current layout. + + The new shape is compatible if: + * the new and old shapes have the same volume + * the old strides can be split or flattened to match the new shape, + assuming indices are iterated in C-order + + A single extent in the ``shape`` tuple can be set to -1 to indicate + it should be inferred from the old volume and the other extents. + + .. highlight:: python + .. code-block:: python + + layout = StridedLayout.dense((5, 3, 4), 1) + assert layout.reshaped((20, 3)) == StridedLayout.dense((20, 3), 1) + assert layout.reshaped((4, -1)) == StridedLayout.dense((4, 15), 1) + assert layout.permuted((2, 0, 1)).reshaped((4, 15,)) == StridedLayout((4, 15), (1, 4), 1) + # layout.permuted((2, 0, 1)).reshaped((20, 3)) -> error + """ + cdef StridedLayout new_layout = StridedLayout.__new__(StridedLayout) + cdef BaseLayout new_shape + init_base_layout(new_shape, len(shape)) + for i in range(len(shape)): + new_shape.shape[i] = shape[i] + self.reshape_into(new_layout, new_shape) + return new_layout + + def permuted(self : StridedLayout, axis_order : tuple[int]) -> StridedLayout: + """ + Returns a new layout where the shape and strides tuples are permuted + according to the specified permutation of axes. 
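+ + .. highlight:: python + .. code-block:: python + + # An illustrative sketch; the expected layout follows from the permutation rule above. + layout = StridedLayout((5, 3, 7), (21, 7, 1), 1) + assert layout.permuted((2, 0, 1)) == StridedLayout((7, 5, 3), (1, 21, 7), 1)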
+ """ + cdef StridedLayout new_layout = StridedLayout.__new__(StridedLayout) + cdef axis_vec_t axis_order_vec + _tuple2axis_vec(axis_order_vec, axis_order) + self.permute_into(new_layout, axis_order_vec) + return new_layout + + def flattened(self : StridedLayout, start_axis : int = 0, end_axis : int = -1, mask : int | None = None) -> StridedLayout: + """ + Merges consecutive extents into a single extent (equal to the product of merged extents) + if the corresponding strides can be replaced with a single stride + (assuming indices are iterated in C-order, i.e. the rightmost + axis is incremented first). + + .. highlight:: python + .. code-block:: python + + # the two extents can be merged into a single extent + # because layout.strides[0] == layout.strides[1] * layout.shape[1] + layout = StridedLayout((3, 2), (2, 1), 1) + assert layout.flattened() == StridedLayout((6,), (1,), 1) + + # the two extents cannot be merged into a single extent + # because layout.strides[0] != layout.strides[1] * layout.shape[1] + layout = StridedLayout((3, 2), (1, 3), 1) + assert layout.flattened() == layout + + If ``start_axis`` and ``end_axis`` are provided, only the axes in the + inclusive range ``[start_axis, end_axis]`` are considered for flattening. + + Alternatively, a mask specifying which axes to consider can be provided. + A mask of mergeable extents can be obtained using the :meth:`flattened_axis_mask` method. + Masks for layouts with the same number of dimensions can be combined + using the logical ``&`` (bitwise AND) operator. + + .. highlight:: python + .. code-block:: python + + layout = StridedLayout.dense((4, 5, 3), 4) + layout2 = StridedLayout((4, 5, 3), (1, 12, 4), 4) + # Even though the two layouts have the same shape initially, + # their shapes differ after flattening. + assert layout.flattened() == StridedLayout((60,), (1,), 4) + assert layout2.flattened() == StridedLayout((4, 15), (1, 4), 4) + # With the mask, only extents that are mergeable in both layouts are flattened + # and the resulting shape is the same for both layouts. + mask = layout.flattened_axis_mask() & layout2.flattened_axis_mask() + assert layout.flattened(mask=mask) == StridedLayout((4, 15), (15, 1), 4) + assert layout2.flattened(mask=mask) == StridedLayout((4, 15), (1, 4), 4) + """ + cdef StridedLayout new_layout = StridedLayout.__new__(StridedLayout) + cdef axes_mask_t axis_mask + if mask is None: + axis_mask = axis_mask_from_range(self.ndim, start_axis, end_axis) + else: + axis_mask = mask + self.flatten_into(new_layout, axis_mask) + return new_layout + + def squeezed(self : StridedLayout) -> StridedLayout: + """ + Returns a new layout where all the singleton dimensions (extents equal to 1) + are removed. Additionally, if the layout volume is 0, + the returned layout will be reduced to a 1-dim layout + with shape (0,) and strides (0,). + """ + cdef StridedLayout new_layout = StridedLayout.__new__(StridedLayout) + self.squeeze_into(new_layout) + return new_layout + + def unsqueezed(self : StridedLayout, axis : int | tuple[int]) -> StridedLayout: + """ + Returns a new layout where the specified axis or axes are added as singleton extents. + The ``axis`` can be either a single integer in range ``[0, ndim]`` + or a tuple of unique integers in range ``[0, ndim + len(axis) - 1]``. 
+ """ + cdef axis_vec_t axis_vec + if isinstance(axis, int): + axis_vec.push_back(axis) + else: + _tuple2axis_vec(axis_vec, axis) + if axis_vec.size() == 0: + return self + cdef StridedLayout new_layout = StridedLayout.__new__(StridedLayout) + self.unsqueeze_into(new_layout, axis_vec) + return new_layout + + def broadcast_to(self : StridedLayout, shape : tuple[int]) -> StridedLayout: + """ + Returns a layout with the new shape, if the old shape can be + broadcast to the new one. + + The shapes are compatible if: + * the new shape has the same or greater number of dimensions + * starting from the right, each extent in the old shape must be 1 or + equal to the corresponding extent in the new shape. + + Strides of the added or modified extents are set to 0, the remaining ones are unchanged. + If the shapes are not compatible, a ValueError is raised. + """ + cdef StridedLayout new_layout = StridedLayout.__new__(StridedLayout) + cdef BaseLayout new_shape + cdef int new_ndim = len(shape) + init_base_layout(new_shape, new_ndim) + for i in range(new_ndim): + new_shape.shape[i] = shape[i] + self.broadcast_into(new_layout, new_shape) + return new_layout + + def repacked(self : StridedLayout, itemsize : int, data_ptr : uintptr_t = 0, axis : int = -1, keep_dim : bool = True) -> StridedLayout: + """ + Converts the layout to match the specified itemsize. + If ``new_itemsize < itemsize``, each element of the tensor is **unpacked** into multiple elements, + i.e. the extent at ``axis`` increases by the factor ``itemsize // new_itemsize``. + If ``new_itemsize > itemsize``, the consecutive elements in the tensor are **packed** into a single element, + i.e. the extent at ``axis`` decreases by the factor ``new_itemsize // itemsize``. + In either case, the ``volume * itemsize`` of the layout remains the same. + + The conversion is subject to the following constraints: + * The old and new itemsizes must be powers of two. + * The extent at ``axis`` must be a positive integer. + * The stride at ``axis`` must be 1. + + Moreover, if the ``new_itemsize > itemsize``: + * The extent at ``axis`` must be divisible by ``new_itemsize // itemsize``. + * All other strides must be divisible by ``new_itemsize // itemsize``. + * The ``slice_offset`` must be divisible by ``new_itemsize // itemsize``. + * If ``data_ptr`` is provided, it must be aligned to the new itemsize. + + The maximum itemsize that satisfies all the constraints + can be obtained using the :meth:`max_compatible_itemsize` method. + + If the ``keep_dim`` is False and the extent at ``axis`` would be reduced to 1, + it is omitted from the returned layout. + + .. highlight:: python + .. code-block:: python + + # Repacking the layout with itemsize = 4 bytes as 2, 8, and 16 sized layouts. + layout = StridedLayout.dense((5, 4), 4) + assert layout.repacked(2) == StridedLayout.dense((5, 8), 2) + assert layout.repacked(8) == StridedLayout.dense((5, 2), 8) + assert layout.repacked(16) == StridedLayout.dense((5, 1), 16) + assert layout.repacked(16, keep_dim=False) == StridedLayout.dense((5,), 16) + + + .. highlight:: python + .. code-block:: python + + # Viewing (5, 6) float array as (5, 3) complex64 array. 
+ a = numpy.ones((5, 6), dtype=numpy.float32) + float_view = StridedMemoryView(a, -1) + layout = float_view.layout + assert layout.shape == (5, 6) + assert layout.itemsize == 4 + complex_view = float_view.view(layout.repacked(8), numpy.complex64) + assert complex_view.layout.shape == (5, 3) + assert complex_view.layout.itemsize == 8 + b = numpy.from_dlpack(complex_view) + assert b.shape == (5, 3) + """ + + if itemsize == self.itemsize: + return self + cdef StridedLayout new_layout = StridedLayout.__new__(StridedLayout) + if itemsize > self.itemsize: + self.pack_into(new_layout, itemsize, data_ptr, keep_dim, axis) + else: + self.unpack_into(new_layout, itemsize, axis) + return new_layout + + def max_compatible_itemsize(self : StridedLayout, max_itemsize : int = 16, data_ptr : uintptr_t = 0, axis : int = -1) -> int: + """ + Returns the maximum itemsize (but no greater than ``max_itemsize``) that can be used + with the :meth:`repacked` method for the current layout. + """ + return self.get_max_compatible_itemsize(max_itemsize, data_ptr, axis) + + def sliced(self : StridedLayout, slices : int | slice | tuple[int | slice]) -> StridedLayout: + """ + Returns a sliced layout. + The ``slices`` parameter can be a single integer, a single :py:class:`slice` object + or a tuple of integers/slices. + + .. hint:: + For convenience, instead of calling this method directly, please rely + on the :py:meth:`~object.__getitem__` operator (i.e. bracket syntax), e.g.: + ``layout[:, start:end:step]``. + + .. note:: + Slicing is purely a layout transformation and does not involve + any data access. + + """ + if not isinstance(slices, tuple): + slices = (slices,) + cdef StridedLayout new_layout = StridedLayout.__new__(StridedLayout) + self.slice_into(new_layout, slices) + return new_layout + + def __getitem__(self : StridedLayout, slices : int | slice | tuple[int | slice]) -> StridedLayout: + return self.sliced(slices) + + cdef axes_mask_t get_flattened_axis_mask(StridedLayout self) except? 
-1 nogil: + return flattened_strides_in_c_index_order_mask(self.base) + + cdef int get_max_compatible_itemsize(StridedLayout self, int max_itemsize, uintptr_t data_ptr, int axis=-1) except -1 nogil: + return max_compatible_itemsize(self.base, self.slice_offset, self.itemsize, max_itemsize, data_ptr, axis) + + cdef int reshape_into(StridedLayout self, StridedLayout out_layout, BaseLayout& new_shape) except -1 nogil: + cdef int64_t old_volume = self.get_volume() + + validate_reshaped_shape(new_shape, old_volume) + _zero_strides(new_shape) + + cdef BaseLayout flattened + if old_volume != 0: + flatten_strides_in_c_index_order(flattened, self.base, AXIS_MASK_ALL) + if not split_strides_in_c_index_order(new_shape, flattened): + raise ValueError("Layout strides are incompatible with the new shape") + + # Reset all memoized properties + out_layout._prop_mask = 0 + + # Copy preserved attributes + out_layout.slice_offset = self.slice_offset + out_layout.itemsize = self.itemsize + maybe_copy_volume(out_layout, self) + + # Set new attributes + _swap_layout(out_layout.base, new_shape) + return 0 + + cdef int permute_into(StridedLayout self, StridedLayout out_layout, axis_vec_t& axis_order) except -1 nogil: + if axis_order.size() != self.base.ndim: + raise ValueError(f"Permutation must have the same length as the number of dimensions, got {axis_order.size()} for {self.ndim}D tensor.") + + cdef BaseLayout permuted + permute_extents(permuted, self.base, axis_order) + + # Reset all memoized properties + out_layout._prop_mask = 0 + + # Preserved attributes + out_layout.itemsize = self.itemsize + out_layout.slice_offset = self.slice_offset + maybe_copy_volume(out_layout, self) + + # Set new attributes + _swap_layout(out_layout.base, permuted) + return 0 + + cdef int flatten_into(StridedLayout self, StridedLayout out_layout, axes_mask_t axis_mask=AXIS_MASK_ALL) except -1 nogil: + cdef BaseLayout flattened + cdef int ndim = flatten_strides_in_c_index_order(flattened, self.base, axis_mask) + + if out_layout is self and ndim == self.base.ndim: + return 0 + + # Reset all memoized properties + out_layout._prop_mask = 0 + + # Preserved attributes + out_layout.itemsize = self.itemsize + out_layout.slice_offset = self.slice_offset + maybe_copy_volume(out_layout, self) + + # Set new attributes + _swap_layout(out_layout.base, flattened) + return 0 + + cdef int squeeze_into(StridedLayout self, StridedLayout out_layout) except -1 nogil: + cdef BaseLayout squeezed + squeeze_extents(squeezed, self.base) + + if out_layout is self and squeezed.ndim == self.base.ndim: + return 0 + + # Reset all memoized properties + out_layout._prop_mask = 0 + + # Preserved attributes + out_layout.itemsize = self.itemsize + out_layout.slice_offset = self.slice_offset + maybe_copy_volume(out_layout, self) + + # Set new attributes + _swap_layout(out_layout.base, squeezed) + return 0 + + cdef int unsqueeze_into(StridedLayout self, StridedLayout out_layout, axis_vec_t& axis_vec) except -1 nogil: + if axis_vec.size() == 0 and self is out_layout: + return 0 + + cdef BaseLayout unsqueezed + unsqueeze_extents(unsqueezed, self.base, axis_vec) + + # Reset all memoized properties + out_layout._prop_mask = 0 + + # Preserved attributes + out_layout.itemsize = self.itemsize + out_layout.slice_offset = self.slice_offset + maybe_copy_volume(out_layout, self) + + # Set new attributes + _swap_layout(out_layout.base, unsqueezed) + return 0 + + cdef int broadcast_into(StridedLayout self, StridedLayout out_layout, BaseLayout& broadcast) except -1 nogil: + 
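# Validate the requested shape first; broadcast_extents then keeps the original strides and sets the strides of added or expanded extents to 0 (see broadcast_to). + # Broadcasting generally changes the volume, so the memoized volume is intentionally not copied here. +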
_validate_shape(broadcast) + broadcast_extents(broadcast, self.base) + + # Reset all memoized properties + out_layout._prop_mask = 0 + + # Preserved attributes + out_layout.itemsize = self.itemsize + out_layout.slice_offset = self.slice_offset + + # Set new attributes + _swap_layout(out_layout.base, broadcast) + return 0 + + cdef int pack_into(StridedLayout self, StridedLayout out_layout, int itemsize, uintptr_t data_ptr, bint keep_dim, int axis=-1) except -1 nogil: + + cdef BaseLayout packed + cdef stride_t new_slice_offset = 0 + cdef int vec_size = pack_extents( + packed, + new_slice_offset, + self.base, + self.slice_offset, + self.itemsize, + itemsize, + data_ptr, + keep_dim, + axis + ) + + if vec_size == 1 and out_layout is self: + return 0 + + # Reset all memoized properties + out_layout._prop_mask = 0 + + # Set new attributes + out_layout.itemsize = itemsize + out_layout.slice_offset = new_slice_offset + _swap_layout(out_layout.base, packed) + return vec_size + + cdef int unpack_into(StridedLayout self, StridedLayout out_layout, int itemsize, int axis=-1) except -1 nogil: + cdef BaseLayout unpacked + cdef int vec_size = unpack_extents( + unpacked, + self.base, + self.itemsize, + itemsize, + axis + ) + if vec_size == 1 and out_layout is self: + return 0 + + cdef int64_t new_slice_offset = _overflow_checked_mul(self.slice_offset, vec_size) + + # Reset all memoized properties + out_layout._prop_mask = 0 + + # Set new attributes + out_layout.itemsize = itemsize + out_layout.slice_offset = new_slice_offset + _swap_layout(out_layout.base, unpacked) + return vec_size + + cdef int slice_into(StridedLayout self, StridedLayout out_layout, tuple slices) except -1: + cdef BaseLayout sliced + cdef stride_t slice_offset = slice_extents(sliced, self.base, slices) + cdef int64_t new_slice_offset = _overflow_checked_sum(self.slice_offset, slice_offset) + + # Reset all memoized properties + out_layout._prop_mask = 0 + + # Preserved attributes + out_layout.itemsize = self.itemsize + + # Set new attributes + _swap_layout(out_layout.base, sliced) + out_layout.slice_offset = new_slice_offset + return 0 + +cdef inline int maybe_copy_volume(StridedLayout out_layout, StridedLayout in_layout) except -1 nogil: + if _has_valid_property(in_layout, PROP_VOLUME): + out_layout._volume = in_layout.get_volume() + _mark_property_valid(out_layout, PROP_VOLUME) + return 0 + + +cdef inline int validate_reshaped_shape(BaseLayout& new_shape, int64_t old_volume) except -1 nogil: + cdef int ndim = new_shape.ndim + cdef int axis = -1 + cdef extent_t extent + for i in range(ndim): + extent = new_shape.shape[i] + if extent < -1: + raise ValueError("Extents must be non-negative") + elif extent == -1: + if axis == -1: + axis = i + else: + raise ValueError("There can be at most one -1 extent in a shape") + cdef int64_t new_volume = _c_abs(_volume(new_shape)) + if axis == -1: + if new_volume != old_volume: + raise ValueError(f"The original volume {old_volume} and the new volume {new_volume} must be equal.") + else: + if new_volume == 0: + raise ValueError("The -1 extent is ambiguous when the specified sub-volume is 0") + extent = old_volume // new_volume + if extent * new_volume != old_volume: + raise ValueError(f"The original volume {old_volume} must be divisible by the specified sub-volume {new_volume}.") + new_shape.shape[axis] = extent + return 0 + + +cdef inline axes_mask_t axis_mask_from_range(int ndim, int start_axis, int end_axis) except? 
-1 nogil: + if ndim == 0 and start_axis == 0 and end_axis == -1: + return AXIS_MASK_ALL + cdef axes_mask_t axis_mask = AXIS_MASK_ALL + if not _normalize_axis(start_axis, ndim): + raise ValueError(f"Invalid start axis: {start_axis} out of range for {ndim}D tensor") + if not _normalize_axis(end_axis, ndim): + raise ValueError(f"Invalid end axis: {end_axis} out of range for {ndim}D tensor") + if start_axis > 0: + axis_mask &= (AXIS_MASK_ALL << (start_axis + 1)) + if end_axis < ndim: + axis_mask &= (AXIS_MASK_ALL >> (STRIDED_LAYOUT_MAX_NDIM - end_axis - 1)) + return axis_mask + + +cdef inline int flatten_strides_in_c_index_order(BaseLayout& out_layout, BaseLayout& in_layout, axes_mask_t axis_mask) except -1 nogil: + cdef int ndim = in_layout.ndim + if ndim == 0: + init_base_layout(out_layout, 1) + out_layout.shape[0] = 1 + out_layout.strides[0] = 1 + return 1 + init_base_layout(out_layout, ndim) + cdef int group_start = 0 + cdef int group_end = 0 + cdef int64_t group_vol + cdef int64_t group_stride + cdef int out_i = 0 + cdef extent_t* in_shape = in_layout.shape + cdef stride_t* in_strides = get_strides_ptr(in_layout) + while group_start < ndim: + group_vol = in_shape[group_start] + group_stride = in_strides[group_start] + group_end = group_start + 1 + while ( + group_end < ndim + and (axis_mask & _axis2mask(group_end)) + and group_stride == _overflow_checked_mul(in_strides[group_end], in_shape[group_end]) + ): + group_vol = _overflow_checked_mul(group_vol, in_shape[group_end]) + group_stride = in_strides[group_end] + group_end += 1 + out_layout.shape[out_i] = group_vol + out_layout.strides[out_i] = group_stride + out_i += 1 + group_start = group_end + if out_i != ndim: + trim_base_layout(out_layout, out_i) + return out_i + + +cdef inline axes_mask_t flattened_strides_in_c_index_order_mask(BaseLayout& layout) except? 
-1 nogil: + if layout.strides == NULL: + return AXIS_MASK_ALL + cdef axes_mask_t axis_mask = 0 + cdef int ndim = layout.ndim + cdef int group_start = 0 + cdef int group_end = 0 + cdef int64_t group_vol + cdef int64_t group_stride + while group_start < ndim: + group_vol = layout.shape[group_start] + group_stride = layout.strides[group_start] + group_end = group_start + 1 + while group_end < ndim and group_stride == layout.strides[group_end] * layout.shape[group_end]: + group_vol = _overflow_checked_mul(group_vol, layout.shape[group_end]) + group_stride = layout.strides[group_end] + axis_mask |= _axis2mask(group_end) + group_end += 1 + group_start = group_end + return axis_mask + + +cdef inline bint split_strides_in_c_index_order(BaseLayout& out_layout, BaseLayout& in_layout) except -1 nogil: + cdef int i = in_layout.ndim - 1 + cdef int new_i = out_layout.ndim - 1 + cdef extent_t extent + cdef extent_t new_extent + cdef extent_t group_vol + cdef stride_t group_stride + cdef extent_t* in_shape = in_layout.shape + cdef stride_t* in_strides = get_strides_ptr(in_layout) + if out_layout.strides == NULL: + _zero_strides(out_layout) + while i >= 0: + extent = in_shape[i] + group_stride = in_strides[i] + group_vol = 1 + while new_i >= 0: + new_extent = out_layout.shape[new_i] + if new_extent == 0: + return False + if new_extent == 1 or group_vol < extent: + out_layout.strides[new_i] = group_stride + group_stride = _overflow_checked_mul(group_stride, new_extent) + group_vol = _overflow_checked_mul(group_vol, new_extent) + new_i -= 1 + else: + break + if group_vol != extent: + return False + i -= 1 + return True + + +cdef inline int permute_extents(BaseLayout& out_layout, BaseLayout& in_layout, axis_vec_t& axis_order) except -1 nogil: + cdef int ndim = in_layout.ndim + init_base_layout(out_layout, ndim) + cdef axis_t axis + cdef axes_mask_t axis_mask + cdef axes_mask_t axis_order_mask = 0 + cdef extent_t* in_shape = in_layout.shape + cdef stride_t* in_strides = get_strides_ptr(in_layout) + + for i in range(ndim): + axis = axis_order[i] + if not _normalize_axis(axis, ndim): + raise ValueError(f"Invalid permutation: axis {axis} out of range for {ndim}D tensor") + axis_mask = _axis2mask(axis) + if axis_order_mask & axis_mask: + raise ValueError(f"Invalid permutation: axis {axis_order[i]} appears multiple times.") + axis_order_mask |= axis_mask + out_layout.shape[i] = in_shape[axis] + out_layout.strides[i] = in_strides[axis] + return 0 + + +cdef inline stride_t slice_extents(BaseLayout& out_layout, BaseLayout& in_layout, tuple slices) except? 
-1: + cdef int ndim = in_layout.ndim + cdef int num_slices = len(slices) + if num_slices > ndim: + raise ValueError(f"The number of slices ({num_slices}) is greater than the number of dimensions ({ndim}).") + init_base_layout(out_layout, ndim) + cdef extent_t* in_shape = in_layout.shape + cdef stride_t* in_strides = get_strides_ptr(in_layout) + cdef stride_t slice_offset = 0 + cdef Py_ssize_t start + cdef Py_ssize_t stop + cdef Py_ssize_t step + cdef extent_t new_extent + cdef object py_slice + cdef bint zero_slice = False + cdef int out_i = 0 + for i in range(num_slices): + py_slice = slices[i] + if isinstance(py_slice, int): + start = py_slice + if not _normalize_axis(start, in_shape[i]): + raise ValueError(f"Invalid index: {start} out of range for axis {i} with extent {in_shape[i]}") + # single element index removes extent from the shape, + # just increase the offset and skip the shape and stride + slice_offset = _overflow_checked_sum(slice_offset, _overflow_checked_mul(start, in_strides[i])) + elif isinstance(py_slice, slice): + _PySlice_Unpack(py_slice, &start, &stop, &step) + new_extent = _PySlice_AdjustIndices(in_shape[i], &start, &stop, step) + if new_extent > 0: + # out_extent > 0 implies start is in [0, extent - 1] range + slice_offset = _overflow_checked_sum(slice_offset, _overflow_checked_mul(start, in_strides[i])) + else: + zero_slice = True + out_layout.shape[out_i] = new_extent + out_layout.strides[out_i] = _overflow_checked_mul(in_strides[i], step) + out_i += 1 + else: + raise TypeError(f"Invalid slice: {py_slice}. Expected slice instance or integer.") + for i in range(num_slices, ndim): + out_layout.shape[out_i] = in_shape[i] + out_layout.strides[out_i] = in_strides[i] + out_i += 1 + if out_i != ndim: + trim_base_layout(out_layout, out_i) + if zero_slice: + _zero_strides(out_layout) + return slice_offset + + +cdef inline int squeeze_extents(BaseLayout& out_layout, BaseLayout& in_layout) except -1 nogil: + cdef int ndim = in_layout.ndim + init_base_layout(out_layout, ndim) + cdef extent_t* in_shape = in_layout.shape + cdef stride_t* in_strides = get_strides_ptr(in_layout) + cdef int out_i = 0 + cdef extent_t extent + for i in range(ndim): + extent = in_shape[i] + if extent == 0: + trim_base_layout(out_layout, 1) + out_layout.shape[0] = 0 + out_layout.strides[0] = 0 + return 1 + elif extent != 1: + out_layout.shape[out_i] = extent + out_layout.strides[out_i] = in_strides[i] + out_i += 1 + if out_i != ndim: + trim_base_layout(out_layout, out_i) + return out_i + + +cdef inline int unsqueeze_extents(BaseLayout& out_layout, BaseLayout& in_layout, axis_vec_t& axis_vec) except -1 nogil: + cdef int ndim = in_layout.ndim + cdef int num_new_axes = axis_vec.size() + cdef int out_ndim = ndim + num_new_axes + # init_base_layout validates out_ndim + init_base_layout(out_layout, out_ndim) + cdef extent_t* in_shape = in_layout.shape + cdef stride_t* in_strides = get_strides_ptr(in_layout) + cdef axes_mask_t out_shape_mask = 0 + cdef axes_mask_t axis_mask = 0 + cdef axis_t axis + for i in range(num_new_axes): + axis = axis_vec[i] + if not _normalize_axis(axis, out_ndim): + raise ValueError(f"Invalid axis: {axis} out of range for {out_ndim}D tensor") + axis_mask = _axis2mask(axis) + if out_shape_mask & axis_mask: + raise ValueError(f"Axis {axis} appears multiple times.") + out_shape_mask |= axis_mask + cdef int in_i = 0 + for i in range(out_ndim): + axis_mask = _axis2mask(i) + if out_shape_mask & axis_mask: + out_layout.shape[i] = 1 + if in_i < ndim: + out_layout.strides[i] = 
_overflow_checked_mul(in_shape[in_i], in_strides[in_i]) + else: + if ndim > 0: + out_layout.strides[i] = in_strides[ndim - 1] + else: + out_layout.strides[i] = 1 + else: + out_layout.shape[i] = in_shape[in_i] + out_layout.strides[i] = in_strides[in_i] + in_i += 1 + assert in_i == ndim + return 0 + + +cdef inline int broadcast_extents(BaseLayout& broadcast, BaseLayout& in_layout) except -1 nogil: + if broadcast.ndim < in_layout.ndim: + raise ValueError( + f"The broadcast shape ndim ({broadcast.ndim}) must be " + f"greater than or equal to the input shape " + f"ndim ({in_layout.ndim})." + ) + cdef int ndim_diff = broadcast.ndim - in_layout.ndim + _zero_strides(broadcast) + cdef extent_t* in_shape = in_layout.shape + cdef stride_t* in_strides = get_strides_ptr(in_layout) + cdef extent_t* broadcast_shape = broadcast.shape + ndim_diff + cdef stride_t* broadcast_strides = broadcast.strides + ndim_diff + for i in range(in_layout.ndim): + if in_shape[i] == broadcast_shape[i]: + broadcast_strides[i] = in_strides[i] + elif in_shape[i] != 1: + raise ValueError( + f"Shapes cannot be broadcast together: " + f"the original extent must be 1 or be equal to broadcast extent, " + f"got {in_shape[i]} and {broadcast_shape[i]} for axis {i}." + ) + # else -> in_extent == 1, the broadcast extent and zero stride are already set + return 0 + + +cdef inline int64_t gcd(int64_t a, int64_t b) except? -1 nogil: + while b != 0: + a, b = b, a % b + return a + + +cdef inline int pack_extents(BaseLayout& out_layout, stride_t& out_slice_offset, BaseLayout& in_layout, stride_t slice_offset, int itemsize, int new_itemsize, uintptr_t data_ptr, bint keep_dim, int axis) except -1 nogil: + cdef int ndim = in_layout.ndim + if new_itemsize <= 0 or new_itemsize & (new_itemsize - 1): + raise ValueError(f"new itemsize must be a power of two, got {new_itemsize}.") + if itemsize <= 0 or itemsize & (itemsize - 1): + raise ValueError(f"itemsize must be a power of two, got {itemsize}.") + if new_itemsize <= itemsize: + if new_itemsize == itemsize: + return 1 + raise ValueError(f"new itemsize ({new_itemsize}) must be greater than or equal to itemsize ({itemsize}).") + if not _normalize_axis(axis, ndim): + raise ValueError(f"Invalid axis: {axis} out of range for {ndim}D tensor") + if data_ptr % new_itemsize != 0: + raise ValueError(f"The data pointer ({data_ptr}) must be aligned to the packed itemsize ({new_itemsize}).") + + cdef extent_t* shape = in_layout.shape + cdef stride_t* strides = get_strides_ptr(in_layout) + if strides[axis] != 1: + raise ValueError(f"The axis {axis} stride must be 1, got {strides[axis]}.") + + cdef int vec_size = new_itemsize // itemsize + cdef extent_t packed_extent = shape[axis] + if packed_extent == 0: + raise ValueError(f"The axis {axis} extent must be non-zero, got {shape[axis]}.") + packed_extent //= vec_size + if packed_extent * vec_size != shape[axis]: + raise ValueError(f"The axis {axis} extent ({shape[axis]}) must be divisible by {vec_size}.") + + cdef stride_t new_slice_offset = slice_offset // vec_size + if new_slice_offset * vec_size != slice_offset: + raise ValueError(f"The slice offset ({slice_offset}) must be divisible by {vec_size}.") + out_slice_offset = new_slice_offset + + init_base_layout(out_layout, ndim) + cdef stride_t packed_stride + cdef int out_i = 0 + for i in range(ndim): + if i == axis: + if keep_dim or packed_extent != 1: # omit the packed axis if it is reduced to 1 + out_layout.shape[out_i] = packed_extent + out_layout.strides[out_i] = 1 + out_i += 1 + else: + packed_stride = 
strides[i] // vec_size + if packed_stride * vec_size != strides[i]: + raise ValueError(f"The {i} axis stride ({strides[i]}) must be divisible by {vec_size}.") + out_layout.shape[out_i] = shape[i] + out_layout.strides[out_i] = packed_stride + out_i += 1 + if out_i != ndim: + trim_base_layout(out_layout, out_i) + return vec_size + + +cdef inline int unpack_extents(BaseLayout &out_layout, BaseLayout &in_layout, int itemsize, int new_itemsize, int axis) except -1 nogil: + cdef int ndim = in_layout.ndim + if not _normalize_axis(axis, ndim): + raise ValueError(f"Invalid axis: {axis} out of range for {ndim}D tensor") + if new_itemsize <= 0 or new_itemsize & (new_itemsize - 1): + raise ValueError(f"new itemsize must be a power of two, got {new_itemsize}.") + if itemsize <= 0 or itemsize & (itemsize - 1): + raise ValueError(f"itemsize must be a power of two, got {itemsize}.") + if new_itemsize >= itemsize: + if new_itemsize == itemsize: + return 1 + raise ValueError(f"new itemsize ({new_itemsize}) must be less than or equal to itemsize ({itemsize}).") + + cdef extent_t* shape = in_layout.shape + cdef stride_t* strides = get_strides_ptr(in_layout) + if shape[axis] == 0: + raise ValueError(f"The axis {axis} extent must be non-zero, got {shape[axis]}.") + if strides[axis] != 1: + raise ValueError(f"The axis {axis} stride must be 1, got {strides[axis]}.") + + cdef int vec_size = itemsize // new_itemsize + init_base_layout(out_layout, ndim) + out_layout.shape[axis] = _overflow_checked_mul(shape[axis], vec_size) + out_layout.strides[axis] = 1 + + for i in range(ndim): + if i == axis: + continue + out_layout.shape[i] = shape[i] + out_layout.strides[i] = _overflow_checked_mul(strides[i], vec_size) + return vec_size + + +cdef inline int max_compatible_itemsize(BaseLayout& layout, stride_t slice_offset, int itemsize, int max_itemsize, uintptr_t data_ptr, int axis) except? 
-1 nogil:
+    cdef int ndim = layout.ndim
+    if max_itemsize <= 0 or max_itemsize & (max_itemsize - 1):
+        raise ValueError(f"max_itemsize must be a power of two, got {max_itemsize}.")
+    if itemsize <= 0 or itemsize & (itemsize - 1):
+        raise ValueError(f"itemsize must be a power of two, got {itemsize}.")
+    if not _normalize_axis(axis, ndim):
+        raise ValueError(f"Invalid axis: {axis} out of range for {ndim}D tensor")
+    if max_itemsize < itemsize:
+        raise ValueError(f"max_itemsize ({max_itemsize}) cannot be less than itemsize ({itemsize}).")
+    max_itemsize = gcd(max_itemsize, _c_abs(data_ptr))
+    cdef extent_t* shape = layout.shape
+    cdef stride_t* strides = get_strides_ptr(layout)
+    if ndim < 1 or strides[axis] != 1 or shape[axis] == 0:
+        return itemsize
+    max_itemsize = gcd(max_itemsize, _overflow_checked_mul(slice_offset, itemsize))
+    max_itemsize = gcd(max_itemsize, _overflow_checked_mul(shape[axis], itemsize))
+    for i in range(ndim):
+        if i == axis:
+            continue
+        max_itemsize = gcd(max_itemsize, _overflow_checked_mul(_c_abs(strides[i]), itemsize))
+    return max_itemsize
diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd
index 00f8274be5..b581dcd293 100644
--- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd
+++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd
@@ -7,14 +7,23 @@
 from libc.stdint cimport uintptr_t
 
 from cuda.core.experimental._stream cimport Stream
 
+
+cdef struct _MemAttrs:
+    int device_id
+    bint is_device_accessible
+    bint is_host_accessible
+
+
 cdef class Buffer:
     cdef:
         uintptr_t _ptr
         size_t _size
         MemoryResource _memory_resource
         object _ipc_data
+        object _owner
         object _ptr_obj
         Stream _alloc_stream
+        _MemAttrs _mem_attrs
+        bint _mem_attrs_inited
 
 
 cdef class MemoryResource:
diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
index eefad08e5d..6db782eefc 100644
--- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
@@ -4,6 +4,7 @@
 
 from __future__ import annotations
 
+cimport cython
 from libc.stdint cimport uintptr_t
 
 from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource
@@ -12,7 +13,9 @@
 from cuda.core.experimental._memory cimport _ipc
 from cuda.core.experimental._stream cimport Stream_accept, Stream
 from cuda.core.experimental._utils.cuda_utils cimport (
     _check_driver_error as raise_if_driver_error,
+    HANDLE_RETURN,
 )
+from cuda.bindings cimport cydriver
 
 import abc
 from typing import TypeVar, Union
@@ -48,6 +51,8 @@ cdef class Buffer:
         self._ipc_data = None
         self._ptr_obj = None
         self._alloc_stream = None
+        self._owner = None
+        self._mem_attrs_inited = False
 
     def __init__(self, *args, **kwargs):
         raise RuntimeError("Buffer objects cannot be instantiated directly. "
" @@ -56,15 +61,19 @@ cdef class Buffer: @classmethod def _init( cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, - stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None + stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None, + owner : object | None = None ): cdef Buffer self = Buffer.__new__(cls) self._ptr = (int(ptr)) self._ptr_obj = ptr self._size = size + if mr is not None and owner is not None: + raise ValueError("owner and memory resource cannot be both specified together") self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None self._alloc_stream = (stream) if stream is not None else None + self._owner = owner return self def __dealloc__(self): @@ -76,7 +85,8 @@ cdef class Buffer: @staticmethod def from_handle( - ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None + ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, + owner: object | None = None, ) -> Buffer: """Create a new :class:`Buffer` object from a pointer. @@ -88,9 +98,13 @@ cdef class Buffer: Memory size of the buffer mr : :obj:`~_memory.MemoryResource`, optional Memory resource associated with the buffer + owner : object, optional + An object holding external allocation that the ``ptr`` points to. + The reference is kept as long as the buffer is alive. + The ``owner`` and ``mr`` cannot be specified together. """ # TODO: It is better to take a stream for latter deallocation - return Buffer._init(ptr, size, mr=mr) + return Buffer._init(ptr, size, mr=mr, owner=owner) @classmethod def from_ipc_descriptor( @@ -228,7 +242,9 @@ cdef class Buffer: """Return the device ordinal of this buffer.""" if self._memory_resource is not None: return self._memory_resource.device_id - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") + else: + Buffer_init_mem_attrs(self) + return self._mem_attrs.device_id @property def handle(self) -> DevicePointerT: @@ -252,14 +268,18 @@ cdef class Buffer: """Return True if this buffer can be accessed by the GPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_device_accessible - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") + else: + Buffer_init_mem_attrs(self) + return self._mem_attrs.is_device_accessible @property def is_host_accessible(self) -> bool: """Return True if this buffer can be accessed by the CPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_host_accessible - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") + else: + Buffer_init_mem_attrs(self) + return self._mem_attrs.is_host_accessible @property def is_mapped(self) -> bool: @@ -277,20 +297,93 @@ cdef class Buffer: """Return the memory size of this buffer.""" return self._size + @property + def owner(self) -> object: + """Return the object holding external allocation.""" + return self._owner + # Buffer Implementation # --------------------- cdef inline void Buffer_close(Buffer self, stream): cdef Stream s - if self._ptr and self._memory_resource is not None: - s = Stream_accept(stream) if stream is not None else self._alloc_stream - self._memory_resource.deallocate(self._ptr, self._size, s) + if self._ptr: + if self._memory_resource is not None: + s = Stream_accept(stream) if stream is not None else 
 
 # Buffer Implementation
 # ---------------------
 
 cdef inline void Buffer_close(Buffer self, stream):
     cdef Stream s
-    if self._ptr and self._memory_resource is not None:
-        s = Stream_accept(stream) if stream is not None else self._alloc_stream
-        self._memory_resource.deallocate(self._ptr, self._size, s)
+    if self._ptr:
+        if self._memory_resource is not None:
+            s = Stream_accept(stream) if stream is not None else self._alloc_stream
+            self._memory_resource.deallocate(self._ptr, self._size, s)
     self._ptr = 0
     self._memory_resource = None
+    self._owner = None
     self._ptr_obj = None
     self._alloc_stream = None
 
 
+cdef Buffer_init_mem_attrs(Buffer self):
+    if not self._mem_attrs_inited:
+        query_memory_attrs(self._mem_attrs, self._ptr)
+        self._mem_attrs_inited = True
+
+
+cdef int query_memory_attrs(_MemAttrs &out, uintptr_t ptr) except -1 nogil:
+    cdef unsigned int memory_type = 0
+    cdef int is_managed = 0
+    cdef int device_id = 0
+    _query_memory_attrs(memory_type, is_managed, device_id, ptr)
+
+    if memory_type == 0:
+        # unregistered host pointer
+        out.is_host_accessible = True
+        out.is_device_accessible = False
+        out.device_id = -1
+    # for managed memory, the memory type can be CU_MEMORYTYPE_DEVICE,
+    # so we need to check it first not to falsely claim it is not
+    # host accessible.
+    elif (
+        is_managed
+        or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST
+    ):
+        # For pinned memory allocated with cudaMallocHost or page-locked
+        # with cudaHostRegister, the memory_type is
+        # cydriver.CUmemorytype.CU_MEMORYTYPE_HOST.
+        # TODO(ktokarski): In some cases, the registered memory requires
+        # using different ptr for device and host, we could check
+        # cuMemHostGetDevicePointer and
+        # CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM
+        # to double check the device accessibility.
+        out.is_host_accessible = True
+        out.is_device_accessible = True
+        out.device_id = device_id
+    elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE:
+        out.is_host_accessible = False
+        out.is_device_accessible = True
+        out.device_id = device_id
+    else:
+        raise ValueError(f"Unsupported memory type: {memory_type}")
+    return 0
+
+
+cdef inline int _query_memory_attrs(unsigned int& memory_type, int& is_managed, int& device_id, cydriver.CUdeviceptr ptr) except -1 nogil:
+    cdef cydriver.CUpointer_attribute attrs[3]
+    cdef uintptr_t vals[3]
+    attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE
+    attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED
+    attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL
+    vals[0] = <uintptr_t>&memory_type
+    vals[1] = <uintptr_t>&is_managed
+    vals[2] = <uintptr_t>&device_id
+
+    cdef cydriver.CUresult ret
+    ret = cydriver.cuPointerGetAttributes(3, attrs, <void**>vals, ptr)
+    if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED:
+        with cython.gil:
+            # Device class handles the cuInit call internally
+            from cuda.core.experimental import Device
+            Device()
+        ret = cydriver.cuPointerGetAttributes(3, attrs, <void**>vals, ptr)
+    HANDLE_RETURN(ret)
+    return 0
+
+
 cdef class MemoryResource:
     """Abstract base class for memory resources that manage allocation
     and deallocation of buffers.
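+
+# Behavior sketch (illustrative): a Buffer wrapped around a raw pointer with
+# no MemoryResource resolves its attributes lazily via cuPointerGetAttributes:
+#
+#     buf = Buffer.from_handle(ptr, nbytes)
+#     buf.is_device_accessible   # True for device, managed, and pinned memory
+#     buf.is_host_accessible     # True for host, pinned, and managed memory
+#     buf.device_id              # owning device ordinal, -1 for plain host memory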
diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx
index 40d70ad995..6a0d1c0d49 100644
--- a/cuda_core/cuda/core/experimental/_memoryview.pyx
+++ b/cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -3,6 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from ._dlpack cimport *
+from libc.stdint cimport uintptr_t
+from cuda.core.experimental._layout cimport StridedLayout
+from cuda.core.experimental._stream import Stream
 
 import functools
 from typing import Optional
@@ -10,21 +13,24 @@
 import numpy
 
 from cuda.core.experimental._utils.cuda_utils import handle_return, driver
-from cuda.core.experimental._utils cimport cuda_utils
+from cuda.core.experimental._memory import Buffer
+
 
 # TODO(leofang): support NumPy structured dtypes
 
 
 cdef class StridedMemoryView:
-    """A dataclass holding metadata of a strided dense array/tensor.
+    """A class holding metadata of a strided dense array/tensor.
 
-    A :obj:`StridedMemoryView` instance can be created in two ways:
+    A :obj:`StridedMemoryView` instance can be created in three ways:
 
     1. Using the :obj:`args_viewable_as_strided_memory` decorator (recommended)
-    2. Explicit construction, see below
+    2. Explicit construction relying on DLPack or CUDA Array Interface, see below.
+    3. From :obj:`~_memory.Buffer` and a :obj:`StridedLayout` (see :meth:`from_buffer` classmethod)
 
-    This object supports both DLPack (up to v1.0) and CUDA Array Interface
+    ``StridedMemoryView(obj, stream_ptr)`` can be used to create a view from
+    objects supporting either DLPack (up to v1.0) or CUDA Array Interface
     (CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol
     first, then the CAI protocol. A :obj:`BufferError` is raised if neither is
     supported.
@@ -42,16 +48,20 @@
     consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be
     internally passed to ``obj.__dlpack__()`` instead.
 
-    Attributes
+    Parameters
     ----------
+    obj : Any
+        Any object that supports either DLPack (up to v1.0) or CUDA Array
+        Interface (v3).
+    stream_ptr: int
+        The pointer address (as Python `int`) to the **consumer** stream.
+        Stream ordering will be properly established unless ``-1`` is passed.
+
+
+    Attributes
+    ----------
     ptr : int
         Pointer to the tensor buffer (as a Python `int`).
-    shape : tuple
-        Shape of the tensor.
-    strides : Optional[tuple]
-        Strides of the tensor (in **counts**, not bytes).
-    dtype: numpy.dtype
-        Data type of the tensor.
     device_id : int
         The device ID for where the tensor is located. It is -1 for CPU tensors
         (meaning those only accessible from the host).
@@ -61,18 +71,12 @@
         Whether the tensor data can be modified in place.
     exporting_obj : Any
         A reference to the original tensor object that is being viewed.
+        If the view is created with :meth:`from_buffer`,
+        it will be the Buffer instance passed to the method.
 
-    Parameters
-    ----------
-    obj : Any
-        Any objects that supports either DLPack (up to v1.0) or CUDA Array
-        Interface (v3).
-    stream_ptr: int
-        The pointer address (as Python `int`) to the **consumer** stream.
-        Stream ordering will be properly established unless ``-1`` is passed.
     """
     cdef readonly:
-        intptr_t ptr
+        uintptr_t ptr
         int device_id
         bint is_device_accessible
         bint readonly
@@ -88,11 +92,18 @@
     cdef DLTensor *dl_tensor
 
     # Memoized properties
-    cdef tuple _shape
-    cdef tuple _strides
-    cdef bint _strides_init  # Has the strides tuple been init'ed?
+    # Either lazily inferred from dl_tensor/metadata,
+    # or explicitly provided if created with from_buffer().
+    cdef StridedLayout _layout
+    # Either exporting_obj if it is a Buffer, otherwise a Buffer instance
+    # with owner set to the exporting object.
+    cdef object _buffer
+    # Either lazily inferred from dl_tensor/metadata,
+    # or explicitly provided if created with from_buffer().
+    # In the latter case, it can be None.
     cdef object _dtype
+
 
     def __init__(self, obj=None, stream_ptr=None):
         if obj is not None:
             # populate self's attributes
@@ -120,63 +131,191 @@
             dlm_tensor = <DLManagedTensor*>data
             dlm_tensor.deleter(dlm_tensor)
 
+    @classmethod
+    def from_buffer(
+        cls, buffer: Buffer, layout: StridedLayout,
+        dtype: numpy.dtype | None = None,
+        is_readonly: bool = False
+    ) -> StridedMemoryView:
+        """
+        Creates a :obj:`StridedMemoryView` instance from a :obj:`~_memory.Buffer` and a :obj:`StridedLayout`.
+        The Buffer can be either an allocation coming from a :obj:`MemoryResource` or an external allocation
+        wrapped in a :obj:`~_memory.Buffer` object with ``Buffer.from_handle(ptr, size, owner=...)``.
+
+        .. hint::
+            When allocating the memory for a given layout, the required allocation size
+            can be obtained with the :meth:`StridedLayout.required_size_in_bytes` method.
+            It is best to use the :meth:`StridedLayout.to_dense` method
+            first to make sure the layout is contiguous, to avoid overallocating memory
+            for layouts with gaps.
+
+        .. caution::
+            When creating a :obj:`StridedMemoryView` from a :obj:`~_memory.Buffer`,
+            no synchronization is performed. It is the user's responsibility to ensure
+            the data in ``buffer`` is properly synchronized when consuming the view.
+
+        Parameters
+        ----------
+        buffer : :obj:`~_memory.Buffer`
+            The buffer to create the view from.
+        layout : :obj:`StridedLayout`
+            The layout describing the shape, strides and itemsize of the elements in
+            the buffer.
+        dtype : :obj:`numpy.dtype`, optional
+            Optional dtype.
+            If specified, the dtype's itemsize must match the layout's itemsize.
+            To view the buffer with a different itemsize, please use :meth:`StridedLayout.repacked`
+            first to transform the layout to the desired itemsize.
+        is_readonly : bool, optional
+            Whether to mark the view as readonly.
+        """
+        cdef StridedMemoryView view = StridedMemoryView.__new__(cls)
+        view_buffer_strided(view, buffer, layout, dtype, is_readonly)
+        return view
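+
+    # Usage sketch (illustrative; ``mr`` and ``s`` are assumed to be an
+    # existing MemoryResource and Stream, not part of this module):
+    #
+    #     from cuda.core.experimental.utils import StridedLayout, StridedMemoryView
+    #
+    #     layout = StridedLayout.dense((4, 4), 4)
+    #     buf = mr.allocate(layout.required_size_in_bytes(), stream=s)
+    #     view = StridedMemoryView.from_buffer(buf, layout, dtype="int32")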
+
+    def view(
+        self, layout: StridedLayout | None = None, dtype: numpy.dtype | None = None
+    ) -> StridedMemoryView:
+        """
+        Creates a new view with adjusted layout and dtype.
+        Same as calling :meth:`from_buffer` with the current buffer.
+        """
+        if layout is None and dtype is None:
+            return self
+        cdef StridedMemoryView view = StridedMemoryView.__new__(self.__class__)
+        if layout is None:
+            layout = self.get_layout()
+        if dtype is None:
+            dtype = self.get_dtype()
+        view_buffer_strided(view, self.get_buffer(), layout, dtype, self.readonly)
+        return view
+
+    def copy_from(
+        self, other: StridedMemoryView, stream: Stream,
+        allocator=None,
+        blocking: bool | None = None,
+    ):
+        """
+        Copies the data from the other view into this view.
+
+        The copy can be performed between the following memory spaces:
+        host-to-device, device-to-host, device-to-device (on the same device).
+
+        The following conditions must be met:
+
+        * Both views must have compatible shapes, i.e. the shapes must be equal
+          or the source view's shape must be broadcastable to the target view's shape
+          (see :meth:`StridedLayout.broadcast_to`).
+        * Both views must have the same :attr:`dtype` (or :attr:`StridedLayout.itemsize`
+          if :attr:`dtype` is not specified).
+        * The destination's layout must be unique (see :meth:`StridedLayout.is_unique`).
+
+        Parameters
+        ----------
+        other : StridedMemoryView
+            The view to copy data from.
+        stream : Stream
+            The stream to schedule the copy on.
+        allocator : MemoryResource | None, optional
+            If temporary buffers are needed, the specified memory resource
+            will be used to allocate the memory. If not specified, default
+            resources will be used.
+        blocking : bool | None, optional
+            Whether the call should block until the copy is complete.
+
+            * ``True``: the ``stream`` is synchronized with the host at the end of the call,
+              blocking until the copy is complete.
+            * ``False``: if possible, the call returns immediately once the copy is scheduled.
+              However, in some cases of host-to-device or device-to-host copies, the call may
+              still synchronize with the host if necessary.
+            * ``None`` (default):
+
+              * for device-to-device, it defaults to ``False`` (non-blocking),
+              * for host-to-device or device-to-host, it defaults to ``True`` (blocking).
+        """
+        raise NotImplementedError("Sorry, not supported: copy_from")
+
+    def copy_to(
+        self, other: StridedMemoryView, stream: Stream | None = None,
+        allocator=None,
+        blocking: bool | None = None,
+    ):
+        """
+        Copies the data from this view into the other view.
+
+        For details, see :meth:`copy_from`.
+        """
+        raise NotImplementedError("Sorry, not supported: copy_to")
+
+    @property
+    def layout(self) -> StridedLayout:
+        """
+        The layout of the tensor. For StridedMemoryView created from DLPack or CAI,
+        the layout is inferred from the tensor object's metadata.
+        """
+        return self.get_layout()
+
     @property
     def shape(self) -> tuple[int]:
-        if self._shape is None:
-            if self.exporting_obj is not None:
-                if self.dl_tensor != NULL:
-                    self._shape = cuda_utils.carray_int64_t_to_tuple(
-                        self.dl_tensor.shape,
-                        self.dl_tensor.ndim
-                    )
-                else:
-                    self._shape = self.metadata["shape"]
-            else:
-                self._shape = ()
-        return self._shape
+        """
+        Shape of the tensor.
+        """
+        return self.get_layout().get_shape_tuple()
 
     @property
     def strides(self) -> Optional[tuple[int]]:
-        cdef int itemsize
-        if self._strides_init is False:
-            if self.exporting_obj is not None:
-                if self.dl_tensor != NULL:
-                    if self.dl_tensor.strides:
-                        self._strides = cuda_utils.carray_int64_t_to_tuple(
-                            self.dl_tensor.strides,
-                            self.dl_tensor.ndim
-                        )
-                else:
-                    # This is a Python interface anyway, so not much point
-                    # to using the optimization in cuda_utils.carray_int64_t_to_tuple
-                    strides = self.metadata.get("strides")
-                    if strides is not None:
-                        itemsize = self.dtype.itemsize
-                        self._strides = tuple(x // itemsize for x in strides)
-            self._strides_init = True
-        return self._strides
+        """
+        Strides of the tensor (in **counts**, not bytes).
+        """
+        return self.get_layout().get_strides_tuple()
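+
+    # Relationship sketch (illustrative): shape/strides are now derived from
+    # the memoized StridedLayout, so for any view ``v``:
+    #
+    #     v.shape   == v.layout.shape
+    #     v.strides == v.layout.strides   # in element counts
+    #     v.layout.strides_in_bytes       # byte strides, if needed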
+ """ + return self.get_dtype() def __repr__(self): return (f"StridedMemoryView(ptr={self.ptr},\n" + f" shape={self.shape},\n" + f" strides={self.strides},\n" + + f" itemsize={self.layout.itemsize},\n" + f" dtype={get_simple_repr(self.dtype)},\n" + f" device_id={self.device_id},\n" + f" is_device_accessible={self.is_device_accessible},\n" + f" readonly={self.readonly},\n" + f" exporting_obj={get_simple_repr(self.exporting_obj)})") + cdef inline StridedLayout get_layout(self): + if self._layout is None: + if self.dl_tensor: + self._layout = layout_from_dlpack(self.dl_tensor) + elif self.metadata is not None: + self._layout = layout_from_cai(self.metadata) + else: + raise ValueError("Cannot infer layout from the exporting object") + return self._layout + + cdef inline object get_buffer(self): + """ + Returns Buffer instance with the underlying data. + If the SMV was created from a Buffer, it will return the same Buffer instance. + Otherwise, it will create a new instance with owner set to the exporting object. + """ + if self._buffer is None: + if isinstance(self.exporting_obj, Buffer): + self._buffer = self.exporting_obj + else: + self._buffer = Buffer.from_handle(self.ptr, 0, owner=self.exporting_obj) + return self._buffer + + cdef inline object get_dtype(self): + if self._dtype is None: + if self.dl_tensor != NULL: + self._dtype = dtype_dlpack_to_numpy(&self.dl_tensor.dtype) + elif self.metadata is not None: + # TODO: this only works for built-in numeric types + self._dtype = _typestr2dtype[self.metadata["typestr"]] + return self._dtype + cdef str get_simple_repr(obj): # TODO: better handling in np.dtype objects @@ -285,7 +424,7 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef StridedMemoryView buf = StridedMemoryView() if view is None else view buf.dl_tensor = dl_tensor buf.metadata = capsule - buf.ptr = (dl_tensor.data) + buf.ptr = (dl_tensor.data) buf.device_id = device_id buf.is_device_accessible = is_device_accessible buf.readonly = is_readonly @@ -294,6 +433,27 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): return buf +_builtin_numeric_dtypes = [ + numpy.dtype("uint8"), + numpy.dtype("uint16"), + numpy.dtype("uint32"), + numpy.dtype("uint64"), + numpy.dtype("int8"), + numpy.dtype("int16"), + numpy.dtype("int32"), + numpy.dtype("int64"), + numpy.dtype("float16"), + numpy.dtype("float32"), + numpy.dtype("float64"), + numpy.dtype("complex64"), + numpy.dtype("complex128"), + numpy.dtype("bool"), +] +# Doing it once to avoid repeated overhead +_typestr2dtype = {dtype.str: dtype for dtype in _builtin_numeric_dtypes} +_typestr2itemsize = {dtype.str: dtype.itemsize for dtype in _builtin_numeric_dtypes} + + cdef object dtype_dlpack_to_numpy(DLDataType* dtype): cdef int bits = dtype.bits if dtype.lanes != 1: @@ -375,13 +535,13 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): driver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, buf.ptr)) - cdef intptr_t producer_s, consumer_s + cdef uintptr_t producer_s, consumer_s stream_ptr = int(stream_ptr) if stream_ptr != -1: stream = cai_data.get("stream") if stream is not None: - producer_s = (stream) - consumer_s = (stream_ptr) + producer_s = (stream) + consumer_s = (stream_ptr) assert producer_s > 0 # establish stream order if producer_s != consumer_s: @@ -433,3 +593,69 @@ def args_viewable_as_strided_memory(tuple arg_indices): return func(*args, **kwargs) return wrapped_func return wrapped_func_with_indices + + +cdef inline StridedLayout layout_from_dlpack(DLTensor* 
+
+
+cdef inline StridedLayout layout_from_dlpack(DLTensor* dl_tensor):
+    cdef StridedLayout layout = StridedLayout.__new__(StridedLayout)
+    cdef int nbits = dl_tensor.dtype.bits * dl_tensor.dtype.lanes
+    cdef int itemsize = nbits >> 3
+    if (itemsize << 3) != nbits:
+        raise ValueError("dl_tensor.dtype.bits must be a multiple of 8")
+    layout.init_from_ptr(dl_tensor.ndim, dl_tensor.shape, dl_tensor.strides, itemsize)
+    return layout
+
+
+cdef StridedLayout layout_from_cai(object metadata):
+    cdef StridedLayout layout = StridedLayout.__new__(StridedLayout)
+    cdef object shape = metadata["shape"]
+    cdef object strides = metadata.get("strides")
+    cdef int itemsize = _typestr2itemsize[metadata["typestr"]]
+    layout.init_from_tuple(shape, strides, itemsize, True)
+    return layout
+
+
+cdef inline uintptr_t _get_data_ptr(object buffer, StridedLayout layout) except? 0:
+    cdef bint is_allocated = buffer.memory_resource is not None
+    # Check that the layout's offset range [min_offset, max_offset] fits
+    # within the [0, buffer.size - 1] range.
+    # The required_size_in_bytes fails if min_offset < 0.
+    # NB. For external memory, both positive and negative offsets can be valid,
+    # but for a proper check we'd need to know both size and data offset,
+    # while neither is reported by the packages.
+    if is_allocated and buffer.size < layout.get_required_size_in_bytes():
+        raise ValueError(
+            f"Buffer size is too small for the layout. "
+            f"Expected at least {layout.get_required_size_in_bytes()} bytes, "
+            f"got {buffer.size} bytes."
+        )
+    return <uintptr_t>(int(buffer.handle)) + layout.get_slice_offset_in_bytes()
+
+
+cdef inline int view_buffer_strided(
+    StridedMemoryView view,
+    object buffer,
+    StridedLayout layout,
+    object dtype,
+    bint is_readonly,
+) except -1:
+    if dtype is not None:
+        dtype = numpy.dtype(dtype)
+        if dtype.itemsize != layout.itemsize:
+            raise ValueError(
+                f"The dtype's itemsize ({dtype.itemsize}) does not match the layout's "
+                f"itemsize ({layout.itemsize}). Please use :meth:`StridedLayout.repacked` "
+                f"to transform the layout to the desired itemsize."
+            )
+    # set the public attributes
+    view.ptr = _get_data_ptr(buffer, layout)
+    view.device_id = buffer.device_id
+    view.is_device_accessible = buffer.is_device_accessible
+    view.readonly = is_readonly
+    view.exporting_obj = view._buffer = buffer
+    # no dlpack/cai metadata
+    view.dl_tensor = NULL
+    view.metadata = None
+    # we get the layout from the caller
+    view._layout = layout
+    view._dtype = dtype
+    return 0
diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
index 0e75202498..ce30285aa5 100644
--- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
+++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
@@ -4,7 +4,7 @@
 cimport cpython
 from cpython.object cimport PyObject
-from libc.stdint cimport int64_t
+from libc.stdint cimport int64_t, int32_t
 
 from cuda.bindings cimport cydriver
 
@@ -13,6 +13,11 @@
 ctypedef fused supported_error_type:
     cydriver.CUresult
 
 
+ctypedef fused integer_t:
+    int64_t
+    int32_t
+
+
 # mimic CU_DEVICE_INVALID
 cdef const cydriver.CUcontext CU_CONTEXT_INVALID = <cydriver.CUcontext>(-2)
 
@@ -41,7 +46,7 @@
 cdef extern from "Python.h":
     void _PyTuple_SET_ITEM "PyTuple_SET_ITEM" (object p, Py_ssize_t pos, PyObject *o)
 
 
-cdef inline tuple carray_int64_t_to_tuple(int64_t *ptr, int length):
+cdef inline tuple carray_integer_t_to_tuple(integer_t *ptr, int length):
     # Construct shape and strides tuples using the Python/C API for speed
     cdef tuple result = cpython.PyTuple_New(length)
     for i in range(length):
diff --git a/cuda_core/cuda/core/experimental/include/layout.hpp b/cuda_core/cuda/core/experimental/include/layout.hpp
new file mode 100644
index 0000000000..58c408889e
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/include/layout.hpp
@@ -0,0 +1,54 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+#ifndef CUDA_CORE_LAYOUT_HPP
+#define CUDA_CORE_LAYOUT_HPP
+
+#include <algorithm>
+#include <cstdint>
+#include <numeric>
+#include <vector>
+
+
+#define STRIDED_LAYOUT_MAX_NDIM 64
+#define AXIS_MASK_ALL 0xFFFFFFFFFFFFFFFEULL
+
+inline int64_t _c_abs(int64_t x)
+{
+    return std::abs(x);
+}
+
+template <typename T>
+void _swap(T &a, T &b) noexcept
+{
+    std::swap(a, b);
+}
+
+inline void _order_from_strides(std::vector<int32_t>& indices, const int64_t* shape, const int64_t* strides, int ndim)
+{
+    indices.resize(ndim);
+    std::iota(indices.begin(), indices.end(), 0);
+    if (!strides) {
+        return;
+    }
+    std::sort(indices.begin(), indices.end(),
+              [&strides, &shape](int i, int j)
+              {
+                  int64_t stride_i = _c_abs(strides[i]);
+                  int64_t stride_j = _c_abs(strides[j]);
+                  if (stride_i != stride_j)
+                  {
+                      return stride_i > stride_j;
+                  }
+                  int64_t shape_i = shape[i];
+                  int64_t shape_j = shape[j];
+                  if (shape_i != shape_j)
+                  {
+                      return shape_i > shape_j;
+                  }
+                  return i < j;
+              });
+}
+
+#endif  // CUDA_CORE_LAYOUT_HPP
diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/experimental/utils.py
index 32f62918f6..3227f1eae1 100644
--- a/cuda_core/cuda/core/experimental/utils.py
+++ b/cuda_core/cuda/core/experimental/utils.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from cuda.core.experimental._layout import StridedLayout  # noqa: F401
 from cuda.core.experimental._memoryview import (
     StridedMemoryView,  # noqa: F401
     args_viewable_as_strided_memory,  # noqa: F401
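+
+# Public import sketch (illustrative): after this change, both utilities are
+# importable from the public utils module.
+#
+#     from cuda.core.experimental.utils import StridedLayout, StridedMemoryView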
diff --git a/cuda_core/docs/source/_templates/autosummary/cyclass.rst b/cuda_core/docs/source/_templates/autosummary/cyclass.rst
new file mode 100644
index 0000000000..8728ab53ef
--- /dev/null
+++ b/cuda_core/docs/source/_templates/autosummary/cyclass.rst
@@ -0,0 +1,27 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: Apache-2.0
+
+{{ fullname | escape | underline}}
+
+.. currentmodule:: {{ module }}
+
+.. autoclass:: {{ objname }}
+
+   {% block attributes %}
+   {% if attributes %}
+   {% for item in attributes %}
+   .. autoattribute:: {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block methods %}
+   {% if methods %}
+   .. rubric:: {{ _('Methods') }}
+
+   {% for item in methods %}
+   .. automethod:: {{ item }}
+   {%- endfor %}
+
+   {% endif %}
+   {% endblock %}
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index d7f4d3642d..45be638eb6 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -75,6 +75,7 @@ Utility functions
 
    args_viewable_as_strided_memory
 
-   :template: dataclass.rst
+   :template: autosummary/cyclass.rst
 
    StridedMemoryView
+   StridedLayout
diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py
index e735918c41..bab2a2b942 100644
--- a/cuda_core/docs/source/conf.py
+++ b/cuda_core/docs/source/conf.py
@@ -123,5 +123,23 @@ def autodoc_process_docstring(app, what, name, obj, options, lines):
         lines.extend(new_lines)
 
 
+def skip_member(app, what, name, obj, skip, options):
+    # Skip undocumented attributes for modules documented with the
+    # cyclass.rst template, where attributes are assumed to be properties
+    # (cythonized properties are not recognized as such by autodoc).
+    excluded_modules = [
+        "cuda.core.experimental._layout",
+        "cuda.core.experimental._memoryview",
+    ]
+    if what == "attribute" and getattr(obj, "__doc__", None) is None:
+        obj_module = getattr(getattr(obj, "__objclass__", None), "__module__", None)
+        if obj_module in excluded_modules:
+            print(f"Skipping undocumented attribute {name} in {obj_module}")
+            return True
+    return None
+
+
 def setup(app):
     app.connect("autodoc-process-docstring", autodoc_process_docstring)
+    app.connect("autodoc-skip-member", skip_member)
diff --git a/cuda_core/tests/helpers/layout.py b/cuda_core/tests/helpers/layout.py
new file mode 100644
index 0000000000..4067eaa6a9
--- /dev/null
+++ b/cuda_core/tests/helpers/layout.py
@@ -0,0 +1,144 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import itertools
+from enum import Enum
+
+import numpy as np
+
+
+class NamedParam:
+    def __init__(self, name, value):
+        self.name = name
+        self.value = value
+
+    def __bool__(self):
+        return bool(self.value)
+
+    def pretty_name(self):
+        if isinstance(self.value, Enum):
+            value_str = self.value.name
+        else:
+            value_str = str(self.value)
+        return f"{self.name}.{value_str}"
+
+
+class DenseOrder(Enum):
+    """
+    Whether to initialize the dense layout in C or F order.
+    For C, the strides can be explicit or implicit (None).
+    """
+
+    C = "C"
+    IMPLICIT_C = "implicit_c"
+    F = "F"
+
+
+class LayoutSpec:
+    """
+    Pretty printable specification of a layout in a test case.
+ """ + + def __init__( + self, + shape, + itemsize, + stride_order=DenseOrder.C, + perm=None, + slices=None, + np_ref=None, + ): + self.shape = shape + self.itemsize = itemsize + self.stride_order = stride_order + self.perm = perm + if slices is not None and not isinstance(slices, list): + slices = [slices] + self.slices = slices + self.np_ref = np_ref + + def pretty_name(self): + desc = [ + f"ndim.{len(self.shape)}", + f"shape.{self.shape}", + f"itemsize.{self.itemsize}", + ] + if self.stride_order is not None: + if isinstance(self.stride_order, DenseOrder): + desc.append(f"stride_order.{self.stride_order.value}") + else: + assert isinstance(self.stride_order, tuple) + assert len(self.stride_order) == len(self.shape) + desc.append(f"stride_order.{self.stride_order}") + if self.perm is not None: + desc.append(f"perm.{self.perm}") + if self.slices is not None: + desc.append(f"slices.{self.slices}") + return "-".join(desc) + + def dtype_from_itemsize(self): + return dtype_from_itemsize(self.itemsize) + + def np_order(self): + return "F" if self.stride_order == DenseOrder.F else "C" + + def has_no_strides(self): + return self.stride_order == DenseOrder.IMPLICIT_C + + def has_no_strides_transformed(self): + return self.stride_order == DenseOrder.IMPLICIT_C and self.perm is None and self.slices is None + + +def dtype_from_itemsize(itemsize): + if itemsize <= 8: + return np.dtype(f"int{itemsize * 8}") + elif itemsize == 16: + return np.dtype("complex128") + else: + raise ValueError(f"Unsupported itemsize: {itemsize}") + + +def pretty_name(val): + """ + Pytest does not pretty print (repr/str) parameters of custom types. + Use this function as the `ids` argument of `pytest.mark.parametrize`, e.g.: + ``@pytest.mark.parametrize(..., ids=pretty_name)`` + """ + if hasattr(val, "pretty_name"): + return val.pretty_name() + # use default pytest pretty printing + return None + + +def flatten_mask2str(mask, ndim): + return "".join("1" if mask & (1 << i) else "0" for i in range(ndim)) + + +def random_permutations(rng, perm_len, cutoff_len=3, sample_size=6): + if perm_len <= cutoff_len: + return [perm for perm in itertools.permutations(range(perm_len))] + perms = [] + for _ in range(sample_size): + perm = list(range(perm_len)) + rng.shuffle(perm) + perms.append(tuple(perm)) + return perms + + +def inv_permutation(perm): + inv = [None] * len(perm) + for i, p in enumerate(perm): + inv[p] = i + return tuple(inv) + + +def permuted(t, perm): + return tuple(t[i] for i in perm) + + +def long_shape(rng, ndim, num_non_unit_dims=5, max_dim_size=6): + dims = [min(i + 2, max_dim_size) for i in range(num_non_unit_dims)] + dims.extend(1 for i in range(ndim - num_non_unit_dims)) + rng.shuffle(dims) + return tuple(dims) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index f536cbf78f..300aeb92a1 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -9,6 +9,7 @@ from cuda.bindings import driver except ImportError: from cuda import cuda as driver + try: import numpy as np except ImportError: @@ -27,6 +28,9 @@ VirtualMemoryResource, VirtualMemoryResourceOptions, ) +from cuda.core.experimental import ( + system as ccx_system, +) from cuda.core.experimental._dlpack import DLDeviceType from cuda.core.experimental._memory import IPCBufferDescriptor from cuda.core.experimental._utils.cuda_utils import CUDAError, handle_return @@ -236,6 +240,141 @@ def test_buffer_close(): buffer_close(DummyPinnedMemoryResource(device)) +def test_buffer_external_host(): + a = (ctypes.c_byte 
* 20)()
+    ptr = ctypes.addressof(a)
+    buffer = Buffer.from_handle(ptr, 20, owner=a)
+    assert not buffer.is_device_accessible
+    assert buffer.is_host_accessible
+    assert buffer.device_id == -1
+    buffer.close()
+
+
+@pytest.mark.parametrize("change_device", [True, False])
+def test_buffer_external_device(change_device):
+    n = ccx_system.num_devices
+    if n < 1:
+        pytest.skip("No devices found")
+    dev_id = n - 1
+    d = Device(dev_id)
+    d.set_current()
+    buffer_ = d.allocate(size=32)
+
+    if change_device:
+        # let's switch to a different device if possible
+        # to make sure we get the original device id
+        d = Device(0)
+        d.set_current()
+
+    buffer = Buffer.from_handle(int(buffer_.handle), 32)
+    assert buffer.is_device_accessible
+    assert not buffer.is_host_accessible
+    assert buffer.device_id == dev_id
+    buffer.close()
+    buffer_.close()
+
+
+@pytest.mark.parametrize("change_device", [True, False])
+def test_buffer_external_pinned_alloc(change_device):
+    n = ccx_system.num_devices
+    if n < 1:
+        pytest.skip("No devices found")
+    dev_id = n - 1
+    d = Device(dev_id)
+    d.set_current()
+    mr = DummyPinnedMemoryResource(d)
+    buffer_ = mr.allocate(size=32)
+
+    if change_device:
+        # let's switch to a different device if possible
+        # to make sure we get the original device id
+        d = Device(0)
+        d.set_current()
+
+    buffer = Buffer.from_handle(int(buffer_.handle), 32)
+    assert buffer.is_device_accessible
+    assert buffer.is_host_accessible
+    assert buffer.device_id == dev_id
+    buffer.close()
+    buffer_.close()
+
+
+@pytest.mark.parametrize("change_device", [True, False])
+def test_buffer_external_pinned_registered(change_device):
+    n = ccx_system.num_devices
+    if n < 1:
+        pytest.skip("No devices found")
+    dev_id = n - 1
+    d = Device(dev_id)
+    d.set_current()
+    a = (ctypes.c_byte * 20)()
+    ptr = ctypes.addressof(a)
+
+    buffer = Buffer.from_handle(ptr, 20, owner=ptr)
+    assert not buffer.is_device_accessible
+    assert buffer.is_host_accessible
+    assert buffer.device_id == -1
+
+    handle_return(driver.cuMemHostRegister(ptr, 20, 0))
+    try:
+        if change_device:
+            # let's switch to a different device if possible
+            # to make sure we get the original device id
+            d = Device(0)
+            d.set_current()
+
+        buffer = Buffer.from_handle(ptr, 20, owner=ptr)
+        assert buffer.is_device_accessible
+        assert buffer.is_host_accessible
+        assert buffer.device_id == dev_id
+        buffer.close()
+    finally:
+        handle_return(driver.cuMemHostUnregister(ptr))
+
+
+@pytest.mark.parametrize("change_device", [True, False])
+def test_buffer_external_managed(change_device):
+    n = ccx_system.num_devices
+    if n < 1:
+        pytest.skip("No devices found")
+    dev_id = n - 1
+    d = Device(dev_id)
+    d.set_current()
+    ptr = None
+    try:
+        ptr = handle_return(driver.cuMemAllocManaged(32, driver.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value))
+        if change_device:
+            # let's switch to a different device if possible
+            # to make sure we get the original device id
+            d = Device(0)
+            d.set_current()
+        buffer = Buffer.from_handle(ptr, 32)
+        assert buffer.is_device_accessible
+        assert buffer.is_host_accessible
+        assert buffer.device_id == dev_id
+    finally:
+        if ptr is not None:
+            handle_return(driver.cuMemFree(ptr))
+
+
+def test_memory_resource_and_owner_disallowed():
+    a = (ctypes.c_byte * 20)()
+    ptr = ctypes.addressof(a)
+    with pytest.raises(ValueError, match="cannot both be specified"):
+        Buffer.from_handle(ptr, 20, mr=DummyDeviceMemoryResource(Device()), owner=a)
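+
+
+# Owner-lifetime contract exercised below, in short (illustrative):
+#
+#     buf = Buffer.from_handle(ptr, n, owner=obj)   # takes a reference to obj
+#     buf.close()                                   # drops the reference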
+
+
+def test_owner_close():
+    a = (ctypes.c_byte * 20)()
+    ptr = ctypes.addressof(a)
+    before = sys.getrefcount(a)
+    buffer = Buffer.from_handle(ptr, 20, owner=a)
+    assert sys.getrefcount(a) != before
+    buffer.close()
+    after = sys.getrefcount(a)
+    assert after == before
+
+
 def test_buffer_dunder_dlpack():
     device = Device()
     device.set_current()
diff --git a/cuda_core/tests/test_strided_layout.py b/cuda_core/tests/test_strided_layout.py
new file mode 100644
index 0000000000..6d59480080
--- /dev/null
+++ b/cuda_core/tests/test_strided_layout.py
@@ -0,0 +1,917 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import itertools
+import math
+import random
+from enum import Enum
+
+import numpy as np
+import pytest
+from cuda.core.experimental._layout import StridedLayout
+from helpers.layout import (
+    DenseOrder,
+    LayoutSpec,
+    NamedParam,
+    dtype_from_itemsize,
+    flatten_mask2str,
+    inv_permutation,
+    long_shape,
+    permuted,
+    pretty_name,
+    random_permutations,
+)
+
+_ITEMSIZES = [1, 2, 4, 8, 16]
+_S = np.s_
+
+py_rng = random.Random(42)
+
+
+def _setup_layout_and_np_ref(spec: LayoutSpec):
+    np_ref = np.arange(math.prod(spec.shape), dtype=spec.dtype_from_itemsize())
+
+    if isinstance(spec.stride_order, DenseOrder):
+        np_ref = np_ref.reshape(spec.shape, order=spec.np_order())
+        if spec.stride_order == DenseOrder.IMPLICIT_C:
+            layout = StridedLayout(spec.shape, None, spec.itemsize)
+        else:
+            layout = StridedLayout.dense(spec.shape, spec.itemsize, spec.stride_order.value)
+    else:
+        assert isinstance(spec.stride_order, tuple)
+        assert len(spec.stride_order) == len(spec.shape)
+        # numpy does not allow specifying the tuple order (only C/F)
+        np_ref = np_ref.reshape(permuted(spec.shape, spec.stride_order))
+        np_ref = np_ref.transpose(inv_permutation(spec.stride_order))
+        layout = StridedLayout.dense(spec.shape, spec.itemsize, spec.stride_order)
+    return layout, np_ref
+
+
+def _transform(layout: StridedLayout, np_ref: np.ndarray, spec: LayoutSpec):
+    if spec.perm is not None:
+        np_ref = np_ref.transpose(spec.perm)
+        layout = layout.permuted(spec.perm)
+    if spec.slices is not None:
+        for sl in spec.slices:
+            np_ref = np_ref[sl]
+            layout = layout.sliced(sl)
+    return layout, np_ref
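+
+
+# Test flow in short (illustrative): each case builds a (layout, ndarray) pair
+# with identical geometry, applies the same permutation/slicing to both, and
+# then compares the resulting metadata:
+#
+#     layout, ref = _setup_layout_and_np_ref(spec)
+#     layout, ref = _transform(layout, ref, spec)
+#     assert layout.shape == ref.shape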
+ """ + ndim = len(arr.shape) + assert layout.ndim == ndim + assert layout.shape == arr.shape + volume = math.prod(arr.shape) + assert layout.volume == volume + assert layout.itemsize == arr.itemsize + assert layout.slice_offset * layout.itemsize == layout.slice_offset_in_bytes + + ref_c_contig = arr.flags["C_CONTIGUOUS"] + ref_f_contig = arr.flags["F_CONTIGUOUS"] + assert layout.is_contiguous_c == ref_c_contig + assert layout.is_contiguous_f == ref_f_contig + ref_any_contig = ref_c_contig or ref_f_contig or arr.transpose(layout.stride_order).flags["C_CONTIGUOUS"] + assert layout.is_contiguous_any == ref_any_contig + assert layout.is_dense == (ref_any_contig and layout.slice_offset == 0) + + if expect_strides_none: + assert layout.strides is None + assert layout.strides_in_bytes is None + assert arr.flags["C_CONTIGUOUS"] + elif math.prod(arr.shape) == 0: + assert layout.strides_in_bytes == tuple(0 for _ in range(ndim)) + else: + assert layout.strides_in_bytes == arr.strides + + +def _cmp_layout_from_dense_vs_from_np(layout: StridedLayout, np_ref: np.ndarray, has_no_strides: bool): + """ + Compare the layout created through series of transformations vs + the layout created from numpy.ndarray transformed accordingly. + """ + + layout_from_np = StridedLayout(np_ref.shape, np_ref.strides, np_ref.itemsize, divide_strides=True) + assert layout_from_np.shape == layout.shape + assert layout_from_np.itemsize == layout.itemsize + assert layout_from_np.is_contiguous_c == layout.is_contiguous_c + assert layout_from_np.is_contiguous_f == layout.is_contiguous_f + assert layout_from_np.is_contiguous_any == layout.is_contiguous_any + assert layout_from_np.is_unique == layout.is_unique + volume = math.prod(np_ref.shape) + assert layout_from_np.volume == layout.volume == volume + + if volume > 0: + assert layout_from_np.stride_order == layout.stride_order + + if has_no_strides: + assert layout_from_np.is_contiguous_c + assert layout_from_np.is_contiguous_any + dense_layout = StridedLayout.dense(np_ref.shape, np_ref.itemsize) + assert layout_from_np.strides == dense_layout.strides + assert layout_from_np.strides_in_bytes == dense_layout.strides_in_bytes + else: + assert layout_from_np.strides == layout.strides + assert layout_from_np.strides_in_bytes == layout.strides_in_bytes + + +def _check_envelope(layout: StridedLayout, layout_spec: LayoutSpec): + orignal_vol = math.prod(layout_spec.shape) + min_offset, max_offset = layout.offset_bounds + if layout.volume == 0: + assert min_offset == 0 + assert max_offset == -1 + else: + assert min_offset >= 0 + assert min_offset <= max_offset + assert max_offset <= orignal_vol - 1 + if layout.is_dense: + assert min_offset == 0 + assert max_offset == math.prod(layout.shape) - 1 + else: + shape, strides = layout.shape, layout.strides + ref_min_offset = ref_max_offset = layout.slice_offset + ref_min_offset += sum(strides[i] * (shape[i] - 1) for i in range(layout.ndim) if strides[i] < 0) + ref_max_offset += sum(strides[i] * (shape[i] - 1) for i in range(layout.ndim) if strides[i] > 0) + assert min_offset == ref_min_offset + assert max_offset == ref_max_offset + assert 0 <= layout.required_size_in_bytes() <= orignal_vol * layout_spec.itemsize + assert layout.required_size_in_bytes() == (max_offset + 1) * layout.itemsize + + +def _cmp_slice_offset( + layout_0: StridedLayout, + layout_1: StridedLayout, + np_ref_0: np.ndarray, + np_ref_1: np.ndarray, +): + # cannot access numpy's scalar data pointer + if layout_1.ndim > 0: + ref_offset = np_ref_1.ctypes.data - 
+
+
+def _cmp_slice_offset(
+    layout_0: StridedLayout,
+    layout_1: StridedLayout,
+    np_ref_0: np.ndarray,
+    np_ref_1: np.ndarray,
+):
+    # cannot access numpy's scalar data pointer
+    if layout_1.ndim > 0:
+        ref_offset = np_ref_1.ctypes.data - np_ref_0.ctypes.data
+        layout_offset = layout_1.slice_offset_in_bytes - layout_0.slice_offset_in_bytes
+        assert layout_offset == ref_offset
+
+
+@pytest.mark.parametrize(
+    "layout_spec",
+    [
+        LayoutSpec(shape, py_rng.choice(_ITEMSIZES), stride_order)
+        for shape in [tuple(), (5,), (7, 9), (2, 3, 4)]
+        for stride_order in random_permutations(py_rng, len(shape))
+    ],
+    ids=pretty_name,
+)
+def test_dense_with_permutation_as_stride_order(layout_spec):
+    """
+    Test creating StridedLayout with stride_order=tuple(...).
+    """
+    layout, np_ref = _setup_layout_and_np_ref(layout_spec)
+    _cmp_layout_and_array(layout, np_ref, False)
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, False)
+    _check_envelope(layout, layout_spec)
+    assert layout.stride_order == layout_spec.stride_order
+
+
+@pytest.mark.parametrize(
+    "layout_spec",
+    [
+        LayoutSpec(shape, py_rng.choice(_ITEMSIZES), stride_order, perm=permutation)
+        for shape in [
+            tuple(),
+            (1,),
+            (2, 3),
+            (5, 6, 7),
+            (5, 1, 7),
+            (5, 2, 3, 4),
+            long_shape(py_rng, 64),
+        ]
+        for permutation in random_permutations(py_rng, len(shape), sample_size=3)
+        for stride_order in list(DenseOrder)
+    ],
+    ids=pretty_name,
+)
+def test_permuted(layout_spec):
+    """
+    Test creating StridedLayout with dense (C/F) order or implicit C order
+    StridedLayout(strides=None) and calling permuted(perm) on it.
+    Tests against numpy transpose and checks the stride_order attribute.
+    """
+    layout, np_ref = _setup_layout_and_np_ref(layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides())
+    layout, np_ref = _transform(layout, np_ref, layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides_transformed())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides_transformed())
+    _check_envelope(layout, layout_spec)
+    unit_dims_count = sum(dim == 1 for dim in np_ref.shape)
+    if unit_dims_count <= 1:
+        # stride order with multiple unit dimensions is not unique,
+        # a simple equality check won't do
+        expected_order = inv_permutation(layout_spec.perm)
+        if layout_spec.np_order() == "F":
+            expected_order = tuple(reversed(expected_order))
+        assert layout.stride_order == expected_order
+
+
+class PermutedErr(Enum):
+    REPEATED_AXIS = "axis -?\\d+ appears multiple times"
+    OUT_OF_RANGE = "axis -?\\d+ out of range for"
+    WRONG_LEN = "the same length as the number of dimensions"
+
+
+@pytest.mark.parametrize(
+    ("layout_spec", "error_msg"),
+    [
+        (
+            LayoutSpec(shape, py_rng.choice(_ITEMSIZES), stride_order, perm=permutation),
+            error_msg,
+        )
+        for shape, permutation, error_msg in [
+            (tuple(), (5,), PermutedErr.WRONG_LEN),
+            ((1,), (0, 0), PermutedErr.WRONG_LEN),
+            ((2, 5, 3), (1, 0, 1), PermutedErr.REPEATED_AXIS),
+            ((5, 6, 7), (1, 3, 0), PermutedErr.OUT_OF_RANGE),
+            ((5, 6, 7), (1, -2000, 0), PermutedErr.OUT_OF_RANGE),
+        ]
+        for stride_order in list(DenseOrder)
+    ],
+    ids=pretty_name,
+)
+def test_permuted_validation(layout_spec, error_msg):
+    layout, _ = _setup_layout_and_np_ref(layout_spec)
+    with pytest.raises(ValueError, match=error_msg.value):
+        layout.permuted(layout_spec.perm)
+
+
+class SliceErr(Enum):
+    ZERO_STEP = "slice step cannot be zero"
+    TOO_MANY_SLICES = "is greater than the number of dimensions"
+    OUT_OF_RANGE = "out of range for axis"
+    TYPE_ERROR = "Expected slice instance or integer."
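+
+
+# Slicing semantics under test (illustrative): StridedLayout indexing mirrors
+# numpy basic indexing, e.g.
+#
+#     layout = StridedLayout.dense((11, 12, 3), 4)
+#     layout[_S[0, 1, :-1]].shape == np.empty((11, 12, 3))[0, 1, :-1].shape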
+
+
+@pytest.mark.parametrize(
+    ("layout_spec", "error_msg"),
+    [
+        (
+            LayoutSpec(shape, py_rng.choice(_ITEMSIZES), stride_order, slices=slices),
+            error_msg,
+        )
+        for shape, slices, error_msg in [
+            (tuple(), tuple(), None),
+            ((12,), _S[:], None),
+            ((13,), _S[::-1], None),
+            ((13,), [_S[::-1], _S[::-1]], None),
+            ((13,), [_S[::-1], _S[1:-1], _S[::-1]], None),
+            ((13,), _S[2:-3], None),
+            ((13,), _S[2:-3:2], None),
+            ((13,), _S[-3:2:-2], None),
+            ((13,), [_S[-3:2:-2], _S[1:3]], None),
+            ((3, 5), [_S[:2], _S[:, 3:]], None),
+            ((3, 5), _S[5:4], None),
+            ((3, 5), _S[:, ::0], SliceErr.ZERO_STEP),
+            ((3, 5), _S[:, :-1, :2], SliceErr.TOO_MANY_SLICES),
+            ((11, 12, 3), _S[:, 0, :-1], None),
+            ((11, 12, 3), _S[0, 1, :-1], None),
+            ((11, 12, 3, 5), [_S[0], _S[1]], None),
+            ((11, 12, 3, 5), _S[:, 1, :-1], None),
+            ((11, 12, 3), _S[0, 1, 2], None),
+            ((11, 12, 3), _S[0, 1, 5], SliceErr.OUT_OF_RANGE),
+            ((11, 12, 3), _S[-2], None),
+            ((11, 12, 3), _S[-42], SliceErr.OUT_OF_RANGE),
+            ((11, 12, 3), ["abc"], SliceErr.TYPE_ERROR),
+            (long_shape(py_rng, 64), (slice(None, None, -1),) * 64, None),
+        ]
+        for stride_order in list(DenseOrder)
+    ],
+    ids=pretty_name,
+)
+def test_slice(layout_spec, error_msg):
+    layout, np_ref = _setup_layout_and_np_ref(layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides())
+
+    if error_msg is None:
+        for sl in layout_spec.slices:
+            sliced = layout[sl]
+            sliced_ref = np_ref[sl]
+            _cmp_layout_and_array(sliced, sliced_ref, False)
+            _cmp_layout_from_dense_vs_from_np(sliced, sliced_ref, False)
+            _cmp_slice_offset(layout, sliced, np_ref, sliced_ref)
+            _check_envelope(sliced, layout_spec)
+            layout = sliced
+            np_ref = sliced_ref
+    else:
+        error_cls = TypeError if error_msg == SliceErr.TYPE_ERROR else ValueError
+        with pytest.raises(error_cls, match=error_msg.value):
+            for sl in layout_spec.slices:
+                layout[sl]
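+
+
+# Reshaping never copies: reshaped() only succeeds when the new extents can
+# be regrouped over the existing strides.  Illustration (matches the cases
+# below): a C-contiguous (2, 3, 2) layout permuted with (1, 2, 0) has shape
+# (3, 2, 2) and strides (2, 1, 6); the leading (3, 2) axes form one
+# contiguous run of 6 elements, so reshaping to (6, 2) works (strides
+# (1, 6)), while (2, 6) or (12,) would interleave the stride-6 axis and
+# must raise the "incompatible strides" error instead.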
+ NEG_EXTENT = "Extents must be non-negative" + MULTI_NEG_EXTENTS = "There can be at most one -1 extent in a shape" + AMBIGUOUS_NEG_EXTENT = "The -1 extent is ambiguous when the specified sub-volume is 0" + DIVISIBILITY_VIOLATION = "The original volume \\d+ must be divisible by the specified sub-volume \\d+" + STRIDE = "Layout strides are incompatible with the new shape" + TYPE_ERROR = None + + +@pytest.mark.parametrize( + ("layout_spec", "new_shape", "error_msg"), + [ + ( + LayoutSpec( + shape, + py_rng.choice(_ITEMSIZES), + stride_order, + perm=permutation, + slices=slices, + ), + NamedParam("new_shape", new_shape), + error_msg, + ) + for shape, permutation, slices, new_shape, error_msg in [ + (tuple(), None, None, tuple(), None), + (tuple(), None, None, (1,), None), + (tuple(), None, None, (-1,), None), + (tuple(), None, None, (1, -1, 1), None), + ((1,), None, None, (-1,), None), + ((1,), None, None, tuple(), None), + ((12,), None, _S[:], (12,), None), + ((12,), None, None, (11,), ReshapeErr.VOLUME_MISMATCH), + ((12,), None, _S[1:], (11,), None), + ((0,), None, None, (0,), None), + ((0,), None, None, (1, 3), ReshapeErr.VOLUME_MISMATCH), + ((3,), None, _S[3:], (3,), ReshapeErr.VOLUME_MISMATCH), + ((18,), None, None, (0,), ReshapeErr.VOLUME_MISMATCH), + ((3,), None, _S[2:-1], (0,), None), + ((3,), None, _S[3:], (-1,), None), + ((0,), None, None, (1, -1), None), + ((0,), None, None, (0, -1), ReshapeErr.AMBIGUOUS_NEG_EXTENT), + ((3, 0, 3), None, None, (2, 3, 4, 5, 6, 7, 0, 12), None), + ((3, 0, 3), None, None, (0,), None), + ((12,), None, None, (2, 3, 2), None), + ((12,), None, None, (2, 6), None), + ((12,), None, None, (4, 3), None), + ((12,), None, None, (3, 4), None), + ((7, 12), None, None, (7, 12), None), + ((7, 12), None, None, (12, 7), None), + ((12, 11), None, None, (2, 3, 2, 11), None), + ((12, 11), None, None, (2, 3, 11, 2), None), + ((12, 11), None, None, (2, 11, 3, 2), None), + ((12, 11), None, None, (11, 2, 3, 2), None), + ((12, 11), None, None, (2, 3, 2, -1), None), + ((12, 11), None, None, (2, 3, -1, 2), None), + ((12, 11), None, None, (2, -1, 3, 2), None), + ((12, 11), None, None, (-1, 2, 3, 2), None), + ((12, 11), None, None, (2, 3, -1, 11), None), + ((12, 11), None, None, (2, 3, 11, -1), None), + ((12, 11), None, None, (-1, 11, 3, 2), None), + ((12, 11), None, None, (11, 2, -1, 2), None), + ((5, 12), None, None, (2, 5, 6), None), + ((2, 3, 2), None, None, (12,), None), + ((2, 3, 2), None, None, (6, 2), None), + ((2, 3, 2), None, None, (2, 3, 2), None), + ((2, 3, 2), (1, 2, 0), None, (6, 2), None), + ((2, 3, 2), (1, 2, 0), None, (2, 6), ReshapeErr.STRIDE), + ((2, 3, 2), (1, 2, 0), None, (12,), ReshapeErr.STRIDE), + ((2, 3, 2), (1, 0, 2), None, (3, 2, 2), None), + ((2, 3, 2), (1, 0, 2), None, (3, 4), ReshapeErr.STRIDE), + ((2, 3, 2), (1, 0, 2), None, (6, 2), ReshapeErr.STRIDE), + ((2, 3, 2), (1, 0, 2), None, (12,), ReshapeErr.STRIDE), + ((10, 10, 10), None, _S[::-1, ::-1, :], (10, 10, 10), None), + ((10, 10, 10), None, _S[::-1, ::-1, ::-1], (1000,), None), + ((10, 10, 10), None, _S[::-1, ::-1, :], (100, 10), None), + ((10, 10, 10), None, _S[::-1, ::-1, :], (10, 100), ReshapeErr.STRIDE), + ((10, 10, 10), None, _S[:, :, ::-1], (100, 10), None), + ((10, 10, 10), None, _S[:, :, ::-1], (10, 100), ReshapeErr.STRIDE), + ((10, 10, 10), None, _S[::-1, :, ::-1], (1000,), ReshapeErr.STRIDE), + ((10, 10, 10), (1, 0, 2), _S[::-1, ::-1], (100, 10), ReshapeErr.STRIDE), + ((5, 3), None, _S[:-1, :], (12,), None), + ((13, 3), None, _S[1:, :], (6, 6), None), + ((12, 4), None, _S[:, :-1], (6, 
+            ((12, 4), None, _S[:, :-1], (6, 2, 3), None),
+            ((7, 6, 5), None, None, (70, -1), None),
+            ((7, 6, 5), None, None, (-1, 70), None),
+            ((7, 6, 5), None, None, (71, -1), ReshapeErr.DIVISIBILITY_VIOLATION),
+            ((7, 6, 5), None, None, (-1, 71), ReshapeErr.DIVISIBILITY_VIOLATION),
+            ((7, 6, 5), None, None, (71, -2), ReshapeErr.NEG_EXTENT),
+            ((7, 6, 5), None, None, (-2, 71), ReshapeErr.NEG_EXTENT),
+            ((7, 6, 5), None, None, (-1, 6, -1), ReshapeErr.MULTI_NEG_EXTENTS),
+            ((7, 6, 5), None, None, (-2, -1, -1), ReshapeErr.NEG_EXTENT),
+            ((7, 6, 5), None, None, (-2, -1, -2), ReshapeErr.NEG_EXTENT),
+            ((7, 6, 5), None, None, (-7, 6, -5), ReshapeErr.NEG_EXTENT),
+            ((7, 6, 5), None, None, (5, 0, -1), ReshapeErr.AMBIGUOUS_NEG_EXTENT),
+            ((7, 0, 5), None, None, (5, 0, -1), ReshapeErr.AMBIGUOUS_NEG_EXTENT),
+            ((7, 6, 5), None, None, map, ReshapeErr.TYPE_ERROR),
+            # random 64-dim shape with 5 non-unit extents 2, 3, 4, 5, 6
+            (long_shape(py_rng, 64, 5, 6), None, None, (60, 12), None),
+        ]
+        for stride_order in [DenseOrder.C, DenseOrder.IMPLICIT_C]
+    ],
+    ids=pretty_name,
+)
+def test_reshape(layout_spec, new_shape, error_msg):
+    new_shape = new_shape.value
+    layout, np_ref = _setup_layout_and_np_ref(layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides())
+
+    layout, np_ref = _transform(layout, np_ref, layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides_transformed())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides_transformed())
+
+    if error_msg is None:
+        reshaped = layout.reshaped(new_shape)
+        reshaped_ref = np_ref.reshape(new_shape, copy=False)
+        _cmp_layout_and_array(reshaped, reshaped_ref, False)
+        _cmp_layout_from_dense_vs_from_np(reshaped, reshaped_ref, False)
+        _check_envelope(reshaped, layout_spec)
+    else:
+        # sanity check that numpy cannot reshape without a copy either
+        if error_msg == ReshapeErr.STRIDE:
+            with pytest.raises(ValueError):
+                np_ref.reshape(new_shape, copy=False)
+
+        error_cls = TypeError if error_msg == ReshapeErr.TYPE_ERROR else ValueError
+        msg = None if error_msg == ReshapeErr.TYPE_ERROR else error_msg.value
+        with pytest.raises(error_cls, match=msg):
+            layout.reshaped(new_shape)
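+
+
+# flattened_axis_mask() reports, per axis, whether it can be merged into the
+# preceding axis without changing the order of elements in memory;
+# flattened() then collapses every such run into a single axis.  E.g. (see
+# the expected masks below) a dense C-ordered (1, 2, 3, 4, 5) layout yields
+# the mask "01111" -- the leading axis starts a run and every following axis
+# merges into it -- and flattens to shape (120,) with strides (1,).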
+
+
+@pytest.mark.parametrize(
+    (
+        "layout_spec",
+        "expected_shape",
+        "expected_strides",
+        "expected_axis_mask",
+    ),
+    [
+        (
+            LayoutSpec(
+                shape,
+                py_rng.choice(_ITEMSIZES),
+                stride_order,
+                perm=permutation,
+                slices=slices,
+            ),
+            NamedParam("expected_shape", expected_shape),
+            NamedParam("expected_strides", expected_strides),
+            NamedParam("expected_axis_mask", expected_axis_mask),
+        )
+        for shape, permutation, slices, expected_shape, expected_strides, expected_axis_mask in [
+            (tuple(), None, None, (1,), (1,), ""),
+            ((12,), None, _S[:], (12,), (1,), "0"),
+            ((1, 2, 3, 4, 5), None, None, (120,), (1,), "01111"),
+            ((1, 2, 3, 0, 5), None, None, (0,), (0,), "01111"),
+            ((5, 1, 2, 4, 3), None, _S[:, :, :, :, ::-2], (40, 2), (3, -2), "01110"),
+            ((5, 2, 4, 3), None, _S[:, ::-1, :, :], (5, 2, 12), (24, -12, 1), "0001"),
+            ((5, 7, 4, 3), None, _S[:, ::-1, ::-1], (5, 28, 3), (84, -3, 1), "0010"),
+            ((5, 4, 3, 7), (2, 3, 0, 1), _S[:], (21, 20), (1, 21), "0101"),
+            ((5, 4, 3, 7), (3, 2, 0, 1), None, (7, 3, 20), (1, 7, 21), "0001"),
+            # random 64-dim shape with 4 non-unit extents 2, 3, 4, 5
+            (long_shape(py_rng, 64, 4, 5), None, None, (120,), (1,), "0" + "1" * 63),
+        ]
+        for stride_order in [DenseOrder.C, DenseOrder.IMPLICIT_C]
+    ],
+    ids=pretty_name,
+)
+def test_flatten(
+    layout_spec,
+    expected_shape,
+    expected_strides,
+    expected_axis_mask,
+):
+    expected_shape = expected_shape.value
+    expected_strides = expected_strides.value
+    expected_axis_mask = expected_axis_mask.value
+
+    layout, np_ref = _setup_layout_and_np_ref(layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides())
+
+    layout, np_ref = _transform(layout, np_ref, layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides_transformed())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides_transformed())
+    _check_envelope(layout, layout_spec)
+
+    mask = flatten_mask2str(layout.flattened_axis_mask(), layout.ndim)
+    assert mask == expected_axis_mask
+
+    flattened = layout.flattened()
+    assert flattened.shape == expected_shape
+    assert flattened.strides == expected_strides
+    assert flattened.itemsize == layout_spec.itemsize
+    assert flattened.slice_offset == layout.slice_offset
+
+    # cannot be flattened any further
+    assert flattened.flattened_axis_mask() == 0
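+
+
+# Two layouts describing the same logical shape can be flattened consistently
+# (e.g. to drive a single elementwise loop over both) by intersecting their
+# masks and passing the result to flattened(), so both collapse exactly the
+# same groups of axes -- this is what the test below exercises:
+#
+#     mask = layout_0.flattened_axis_mask() & layout_1.flattened_axis_mask()
+#     flat_0 = layout_0.flattened(mask=mask)
+#     flat_1 = layout_1.flattened(mask=mask)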
+
+
+@pytest.mark.parametrize(
+    (
+        "layout_spec_0",
+        "layout_spec_1",
+        "expected_layout_spec_0",
+        "expected_layout_spec_1",
+    ),
+    [
+        (
+            layout_spec_0,
+            layout_spec_1,
+            expected_layout_spec_0,
+            expected_layout_spec_1,
+        )
+        for layout_spec_0, layout_spec_1, expected_layout_spec_0, expected_layout_spec_1 in [
+            (
+                LayoutSpec(tuple(), 2, DenseOrder.C),
+                LayoutSpec(tuple(), 4, DenseOrder.C),
+                LayoutSpec((1,), 2, DenseOrder.C),
+                LayoutSpec((1,), 4, DenseOrder.C),
+            ),
+            (
+                LayoutSpec(tuple(), 2, DenseOrder.IMPLICIT_C),
+                LayoutSpec(tuple(), 4, DenseOrder.IMPLICIT_C),
+                LayoutSpec((1,), 2, DenseOrder.C),
+                LayoutSpec((1,), 4, DenseOrder.C),
+            ),
+            (
+                LayoutSpec((2, 7, 13, 5), 8, DenseOrder.C),
+                LayoutSpec((3, 5, 11, 1), 4, DenseOrder.C),
+                LayoutSpec((910,), 8, DenseOrder.C),
+                LayoutSpec((165,), 4, DenseOrder.C),
+            ),
+            (
+                LayoutSpec((2, 7, 13, 5), 8, DenseOrder.IMPLICIT_C),
+                LayoutSpec((3, 5, 11, 1), 4, DenseOrder.IMPLICIT_C),
+                LayoutSpec((910,), 8, DenseOrder.C),
+                LayoutSpec((165,), 4, DenseOrder.C),
+            ),
+            (
+                LayoutSpec((5, 7, 13, 2), 4, (3, 1, 2, 0)),
+                LayoutSpec((3, 5, 11, 1), 2, DenseOrder.IMPLICIT_C),
+                LayoutSpec((5, 91, 2), 4, (2, 1, 0)),
+                LayoutSpec((3, 55, 1), 2, DenseOrder.C),
+            ),
+            (
+                LayoutSpec((2, 7, 13, 5), 16, DenseOrder.C),
+                LayoutSpec((11, 1, 3, 5), 1, (2, 3, 0, 1)),
+                LayoutSpec((14, 65), 16, DenseOrder.C),
+                LayoutSpec((11, 15), 1, (1, 0)),
+            ),
+            (
+                LayoutSpec(
+                    (4, 5, 11, 2, 3, 7),
+                    4,
+                    (5, 3, 4, 0, 1, 2),
+                ),
+                LayoutSpec(
+                    (3, 8, 5, 6, 7, 9),
+                    4,
+                    (0, 1, 3, 4, 5, 2),
+                ),
+                LayoutSpec((20, 11, 6, 7), 4, (3, 2, 0, 1)),
+                LayoutSpec((24, 5, 42, 9), 4, (0, 2, 3, 1)),
+            ),
+        ]
+    ],
+    ids=pretty_name,
+)
+def test_flatten_together(
+    layout_spec_0,
+    layout_spec_1,
+    expected_layout_spec_0,
+    expected_layout_spec_1,
+):
+    layouts = []
+    for layout_spec in [
+        layout_spec_0,
+        layout_spec_1,
+        expected_layout_spec_0,
+        expected_layout_spec_1,
+    ]:
+        layout, np_ref = _setup_layout_and_np_ref(layout_spec)
+        layout, np_ref = _transform(layout, np_ref, layout_spec)
+        _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides_transformed())
+        _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides_transformed())
+        layouts.append(layout)
+
+    layout_0, layout_1, expected_layout_0, expected_layout_1 = layouts
+
+    mask_0 = layout_0.flattened_axis_mask()
+    mask_1 = layout_1.flattened_axis_mask()
+    mask = mask_0 & mask_1
+
+    flattened_0 = layout_0.flattened(mask=mask)
+    flattened_1 = layout_1.flattened(mask=mask)
+    _check_envelope(flattened_0, layout_spec_0)
+    _check_envelope(flattened_1, layout_spec_1)
+
+    for flattened, expected_layout in zip([flattened_0, flattened_1], [expected_layout_0, expected_layout_1]):
+        assert flattened == expected_layout
+        assert flattened.shape == expected_layout.shape
+        assert flattened.strides == expected_layout.strides
+        assert flattened.itemsize == expected_layout.itemsize
+        assert flattened.slice_offset == expected_layout.slice_offset
+        assert flattened.is_contiguous_c == expected_layout.is_contiguous_c
+        assert flattened.is_contiguous_f == expected_layout.is_contiguous_f
+        assert flattened.is_contiguous_any == expected_layout.is_contiguous_any
+        assert flattened.is_unique == expected_layout.is_unique
+
+
+@pytest.mark.parametrize(
+    ("layout_spec",),
+    [
+        (
+            LayoutSpec(
+                shape,
+                py_rng.choice(_ITEMSIZES),
+                stride_order,
+                perm=permutation,
+                slices=slices,
+            ),
+        )
+        for shape, permutation, slices in [
+            (tuple(), None, None),
+            ((12,), None, None),
+            ((1, 5, 4, 3), None, None),
+            ((1, 5, 1, 4, 3), None, _S[:, -1:, :]),
+            ((1, 5, 4, 3), None, _S[:, -1:, :1, 1:2]),
+            ((7, 5, 3), (2, 0, 1), _S[::-1, 3:2:-1, :]),
+            ((7, 5, 3), (2, 0, 1), _S[:, 3:2, :]),
+            (long_shape(py_rng, 64, 1), None, None),
+            (long_shape(py_rng, 33, 3), None, None),
+        ]
+        for stride_order in list(DenseOrder)
+    ],
+    ids=pretty_name,
+)
+def test_squeezed(layout_spec):
+    layout, np_ref = _setup_layout_and_np_ref(layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides())
+    layout, np_ref = _transform(layout, np_ref, layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides_transformed())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides_transformed())
+
+    squeezed = layout.squeezed()
+    squeezed_ref = np_ref.squeeze()
+    if math.prod(np_ref.shape) != 0:
+        _cmp_layout_and_array(squeezed, squeezed_ref, False)
+        _cmp_layout_from_dense_vs_from_np(squeezed, squeezed_ref, False)
+    else:
+        assert squeezed.shape == (0,)
+        assert squeezed.strides == (0,)
+        assert squeezed.slice_offset == layout.slice_offset
+    _check_envelope(squeezed, layout_spec)
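+
+
+# unsqueezed(axes) inserts extent-1 axes at the given positions, mirroring
+# numpy.expand_dims.  Illustration: for a (4, 5) layout, unsqueezed((0, 3))
+# produces shape (1, 4, 5, 1).  Extent-1 axes contribute stride * (1 - 1) = 0
+# to the offset bounds, so the envelope is unchanged.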
+
+
+@pytest.mark.parametrize(
+    (
+        "layout_spec",
+        "axes",
+    ),
+    [
+        (
+            LayoutSpec(shape, py_rng.choice(_ITEMSIZES), stride_order, slices=slices),
+            NamedParam("axes", axes),
+        )
+        for shape, slices in [
+            (tuple(), None),
+            ((7,), None),
+            ((4, 5, 7, 11), _S[1:-1, ::-1, 2:-1, ::3]),
+        ]
+        for stride_order in list(DenseOrder)
+        for num_axes in range(3)
+        for axes in itertools.combinations(list(range(len(shape) + num_axes)), num_axes)
+    ],
+    ids=pretty_name,
+)
+def test_unsqueezed_layout(layout_spec, axes):
+    axes = tuple(axes.value)
+
+    layout, np_ref = _setup_layout_and_np_ref(layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides())
+    layout, np_ref = _transform(layout, np_ref, layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides_transformed())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides_transformed())
+
+    unsqueezed = layout.unsqueezed(axes)
+    unsqueezed_ref = np.expand_dims(np_ref, axis=axes)
+    # the implicit C layout is kept only if the original layout has such
+    # strides and there are no actual transformations along the way:
+    # no slices and an empty axes tuple for unsqueezing
+    has_no_strides = layout_spec.has_no_strides_transformed() and len(axes) == 0
+    _cmp_layout_and_array(unsqueezed, unsqueezed_ref, has_no_strides)
+    _cmp_layout_from_dense_vs_from_np(unsqueezed, unsqueezed_ref, has_no_strides)
+    _check_envelope(unsqueezed, layout_spec)
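+
+
+# repacked(new_itemsize, axis=...) fuses consecutive elements along the given
+# stride-1 axis into wider elements, the layout analogue of numpy's
+# ndarray.view(dtype).  Illustration: a dense C (12,) layout with itemsize 4
+# repacked to itemsize 8 has shape (6,), and offsets measured in elements
+# shrink by the vectorization factor new_itemsize // itemsize.
+# max_compatible_itemsize(axis=...) reports the widest itemsize for which
+# such a repack is possible.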
+
+
+@pytest.mark.parametrize(
+    (
+        "layout_spec",
+        "axis",
+        "expected_max_itemsize",
+        "new_itemsize",
+    ),
+    [
+        (
+            LayoutSpec(shape, itemsize, stride_order, perm=permutation, slices=slices),
+            NamedParam("axis", axis),
+            NamedParam("expected_max_itemsize", expected_max_itemsize),
+            NamedParam("new_itemsize", new_itemsize),
+        )
+        for shape, permutation, slices, stride_order, itemsize, axis, expected_max_itemsize, new_itemsize in [
+            ((12,), None, None, DenseOrder.C, 1, -1, 4, 1),
+            ((12,), None, None, DenseOrder.IMPLICIT_C, 1, -1, 4, 1),
+            ((12,), None, None, DenseOrder.F, 1, 0, 4, 1),
+            ((12,), None, None, DenseOrder.C, 4, -1, 16, 8),
+            ((12,), None, None, DenseOrder.IMPLICIT_C, 4, -1, 16, 8),
+            ((12,), None, None, DenseOrder.F, 4, 0, 16, 8),
+            ((16, 5, 4, 6), None, None, DenseOrder.C, 2, -1, 4, 4),
+            ((16, 5, 4, 6), None, None, DenseOrder.IMPLICIT_C, 2, -1, 4, 4),
+            ((16, 5, 4, 6), None, None, DenseOrder.F, 2, 0, 16, 4),
+            ((11, 5, 9), None, _S[:, :, -1:], DenseOrder.C, 2, 2, 2, 2),
+            ((11, 5, 9), None, _S[:, :, -1:], DenseOrder.IMPLICIT_C, 2, 2, 2, 2),
+            ((11, 5, 9), None, _S[:, :, -1:], DenseOrder.F, 2, 0, 2, 2),
+            ((12, 3, 24), (1, 2, 0), _S[::-1, 20:, 1:], DenseOrder.C, 2, 1, 8, 8),
+            ((12, 3, 24), (1, 2, 0), _S[1:, ::-1, 10:], DenseOrder.F, 2, 2, 4, 4),
+            ((1, 3) + (1,) * 61 + (4,), None, None, DenseOrder.C, 2, -1, 8, 8),
+            ((1, 3) + (1,) * 61 + (4,), None, None, DenseOrder.IMPLICIT_C, 2, -1, 8, 4),
+            ((4, 3) + (1,) * 61 + (3,), None, None, DenseOrder.F, 2, 0, 8, 4),
+        ]
+    ],
+    ids=pretty_name,
+)
+def test_packed_unpacked(
+    layout_spec,
+    axis,
+    expected_max_itemsize,
+    new_itemsize,
+):
+    axis = axis.value
+    expected_max_itemsize = expected_max_itemsize.value
+    new_itemsize = new_itemsize.value
+
+    layout, np_ref = _setup_layout_and_np_ref(layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides())
+    layout, np_ref = _transform(layout, np_ref, layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides_transformed())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides_transformed())
+
+    assert layout.max_compatible_itemsize(axis=axis) == expected_max_itemsize
+    packed = layout.repacked(new_itemsize, axis=axis)
+    # numpy does not allow specifying the axis to repack,
+    # so we need to transpose the array first
+    packed_ref = (
+        np_ref.transpose(layout.stride_order)
+        .view(dtype=dtype_from_itemsize(new_itemsize))
+        .transpose(inv_permutation(layout.stride_order))
+    )
+    has_no_strides = layout_spec.has_no_strides_transformed() and layout.itemsize == new_itemsize
+    _cmp_layout_and_array(packed, packed_ref, has_no_strides)
+    _cmp_layout_from_dense_vs_from_np(packed, packed_ref, has_no_strides)
+    _check_envelope(packed, layout_spec)
+    vec_size = new_itemsize // layout.itemsize
+    assert packed.slice_offset * vec_size == layout.slice_offset
+    unpacked = packed.repacked(layout.itemsize, axis=axis)
+    _cmp_layout_and_array(unpacked, np_ref, has_no_strides)
+    _cmp_layout_from_dense_vs_from_np(unpacked, np_ref, has_no_strides)
+    _check_envelope(unpacked, layout_spec)
+
+
+@pytest.mark.parametrize(
+    (
+        "layout_spec",
+        "new_shape",
+    ),
+    [
+        (
+            LayoutSpec(shape, py_rng.choice(_ITEMSIZES), stride_order, slices=slices),
+            NamedParam("new_shape", new_shape),
+        )
+        for shape, slices, new_shape in [
+            (tuple(), None, tuple()),
+            (tuple(), None, (1,)),
+            (tuple(), None, (17, 1, 5)),
+            ((1,), None, (5,)),
+            ((1,), None, (3, 5, 2)),
+            ((7,), None, (7,)),
+            ((7,), None, (2, 7)),
+            ((5, 11), _S[1:-1, ::-1], (3, 11)),
+            ((5, 11), _S[1:-1, ::-1], (7, 3, 11)),
+            ((5, 11), _S[::-1, 3:4], (5, 7)),
+            ((5, 11), _S[::-1, 3:4], (5, 30)),
+            ((5, 11), _S[::-1, 3:4], (4, 5, 12)),
+            ((5, 11), _S[-1:,], (4, 13, 11)),
+            ((2, 3, 3), _S[:, 1:2], (401, 3) + (1,) * 59 + (2, 4, 3)),
+        ]
+        for stride_order in list(DenseOrder)
+    ],
+    ids=pretty_name,
+)
+def test_broadcast_layout(
+    layout_spec,
+    new_shape,
+):
+    new_shape = new_shape.value
+    layout, np_ref = _setup_layout_and_np_ref(layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides())
+    layout, np_ref = _transform(layout, np_ref, layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides_transformed())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides_transformed())
+
+    broadcasted = layout.broadcast_to(new_shape)
+    broadcasted_ref = np.broadcast_to(np_ref, new_shape)
+    _cmp_layout_and_array(broadcasted, broadcasted_ref, False)
+    _cmp_layout_from_dense_vs_from_np(broadcasted, broadcasted_ref, False)
+    _check_envelope(broadcasted, layout_spec)
+    assert layout.is_unique
+    ndim_diff = len(broadcasted_ref.shape) - len(np_ref.shape)
+    expect_unique = all(broadcasted_ref.shape[i] == 1 for i in range(ndim_diff))
+    expect_unique = expect_unique and all(
+        broadcasted_ref.shape[i + ndim_diff] == np_ref.shape[i] for i in range(len(np_ref.shape))
+    )
+    assert broadcasted.is_unique is expect_unique
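+
+
+# to_dense(stride_order) produces a compact layout with the requested memory
+# order, dropping any slicing offset; "K" keeps the input's own stride order
+# (matching numpy's ndarray.copy(order=...)).  The call changes nothing only
+# when the layout is already contiguous in the requested order and starts at
+# offset 0, which is exactly what the is_noop flag below tracks.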
+
+
+@pytest.mark.parametrize(
+    (
+        "layout_spec",
+        "new_stride_order",
+    ),
+    [
+        (
+            LayoutSpec(
+                shape,
+                py_rng.choice(_ITEMSIZES),
+                stride_order,
+                perm=permutation,
+                slices=slices,
+            ),
+            NamedParam("new_stride_order", new_stride_order),
+        )
+        for shape, permutation, slices in [
+            (tuple(), None, None),
+            ((1,), None, None),
+            ((7,), None, None),
+            ((7,), None, _S[3:6]),
+            ((7,), None, _S[::-1]),
+            ((5, 11), None, None),
+            ((5, 11), None, _S[1:-1]),
+            ((5, 11), None, _S[::-1, 3:10]),
+            ((5, 11), None, _S[1:4, ::-1]),
+            ((5, 11), None, _S[-1:,]),
+            ((3, 5, 7), (1, 0, 2), None),
+        ]
+        for stride_order in list(DenseOrder)
+        for new_stride_order in ["C", "F", "K"] + random_permutations(py_rng, len(shape))
+    ],
+    ids=pretty_name,
+)
+def test_to_dense(layout_spec, new_stride_order):
+    new_stride_order = new_stride_order.value
+
+    layout, np_ref = _setup_layout_and_np_ref(layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides())
+    layout, np_ref = _transform(layout, np_ref, layout_spec)
+    _cmp_layout_and_array(layout, np_ref, layout_spec.has_no_strides_transformed())
+    _cmp_layout_from_dense_vs_from_np(layout, np_ref, layout_spec.has_no_strides_transformed())
+
+    if isinstance(new_stride_order, str):
+        if new_stride_order == "K":
+            is_noop = layout.slice_offset == 0 and layout.is_contiguous_any
+        elif new_stride_order == "C":
+            is_noop = layout.slice_offset == 0 and layout.is_contiguous_c
+        elif new_stride_order == "F":
+            is_noop = layout.slice_offset == 0 and layout.is_contiguous_f
+        else:
+            raise AssertionError(f"Invalid new_stride_order: {new_stride_order}")
+        has_no_strides = layout_spec.has_no_strides_transformed() and is_noop
+        dense = layout.to_dense(new_stride_order)
+        dense_ref = np_ref.copy(order=new_stride_order)
+        _cmp_layout_and_array(dense, dense_ref, has_no_strides)
+        _cmp_layout_from_dense_vs_from_np(dense, dense_ref, has_no_strides)
+    else:
+        assert isinstance(new_stride_order, tuple)
+        assert len(new_stride_order) == len(layout.shape)
+        dense = layout.to_dense(new_stride_order)
+        dense_ref = np_ref.transpose(new_stride_order).copy(order="C").transpose(inv_permutation(new_stride_order))
+        _cmp_layout_and_array(dense, dense_ref, False)
+        _cmp_layout_from_dense_vs_from_np(dense, dense_ref, False)
+
+    assert dense.is_dense
+    assert dense.required_size_in_bytes() == np_ref.size * layout.itemsize
+    assert dense.offset_bounds == (0, np_ref.size - 1)
diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index 3580507250..9d72b98af0 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -2,6 +2,8 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import math
+
 try:
     import cupy as cp
 except ImportError:
@@ -15,7 +17,7 @@
 import pytest
 
 from cuda.core.experimental import Device
 from cuda.core.experimental._memoryview import view_as_cai
-from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory
+from cuda.core.experimental.utils import StridedLayout, StridedMemoryView, args_viewable_as_strided_memory
@@ -195,3 +197,134 @@ def _check_view(self, view, in_arr, dev):
         assert view.device_id == dev.device_id
         assert view.is_device_accessible is True
         assert view.exporting_obj is in_arr
+
+
+def _dense_strides(shape, stride_order):
+    ndim = len(shape)
+    strides = [None] * ndim
+    if ndim > 0:
+        if stride_order == "C":
+            strides[-1] = 1
+            for i in range(ndim - 2, -1, -1):
+                strides[i] = strides[i + 1] * shape[i + 1]
+        else:
+            assert stride_order == "F"
+            strides[0] = 1
+            for i in range(1, ndim):
+                strides[i] = strides[i - 1] * shape[i - 1]
+    return tuple(strides)
+
+
+@pytest.mark.parametrize("shape", [tuple(), (2, 3), (10, 10), (10, 13, 11)])
+@pytest.mark.parametrize("itemsize", [1, 4])
+@pytest.mark.parametrize("stride_order", ["C", "F"])
+@pytest.mark.parametrize("readonly", [True, False])
+def test_from_buffer(shape, itemsize, stride_order, readonly):
+    dev = Device()
+    dev.set_current()
+    layout = StridedLayout.dense(shape=shape, itemsize=itemsize, stride_order=stride_order)
+    required_size = layout.required_size_in_bytes()
+    assert required_size == math.prod(shape) * itemsize
+    buffer = dev.memory_resource.allocate(required_size)
+    view = StridedMemoryView.from_buffer(buffer, layout, is_readonly=readonly)
+    assert view.exporting_obj is buffer
+    assert view.layout is layout
+    assert view.ptr == int(buffer.handle)
+    assert view.shape == shape
+    assert view.strides == _dense_strides(shape, stride_order)
+    assert view.dtype is None
+    assert view.device_id == dev.device_id
+    assert view.is_device_accessible
+    assert view.readonly == readonly
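+
+
+# Slicing a view does not allocate: view.view(layout[...]) rebinds the same
+# buffer with the pointer advanced by slice_offset_in_bytes.  For the dense
+# C-ordered (5, 7) layout below, layout[:-2, 3:] starts at element offset
+# 0 * 7 + 3 = 3; in F order the same slice starts at 3 * 5 = 15 elements.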
+
+
+@pytest.mark.parametrize("stride_order", ["C", "F"])
+def test_from_buffer_sliced(stride_order):
+    layout = StridedLayout.dense((5, 7), 2, stride_order=stride_order)
+    device = Device()
+    device.set_current()
+    buffer = device.memory_resource.allocate(layout.required_size_in_bytes())
+    view = StridedMemoryView.from_buffer(buffer, layout)
+    assert view.shape == (5, 7)
+
+    sliced_view = view.view(layout[:-2, 3:])
+    assert sliced_view.shape == (3, 4)
+    expected_offset = 3 if stride_order == "C" else 3 * 5
+    assert sliced_view.layout.slice_offset == expected_offset
+    assert sliced_view.layout.slice_offset_in_bytes == expected_offset * 2
+    assert sliced_view.ptr == view.ptr + expected_offset * 2
+
+
+def test_from_buffer_too_small():
+    layout = StridedLayout.dense((5, 4), 2)
+    d = Device()
+    d.set_current()
+    buffer = d.memory_resource.allocate(20)
+    with pytest.raises(ValueError, match="Expected at least 40 bytes, got 20 bytes."):
+        StridedMemoryView.from_buffer(buffer, layout)
+
+
+def test_from_buffer_disallowed_negative_offset():
+    layout = StridedLayout((5, 4), (-4, 1), 1)
+    d = Device()
+    d.set_current()
+    buffer = d.memory_resource.allocate(20)
+    with pytest.raises(ValueError, match="please use StridedLayout.to_dense()."):
+        StridedMemoryView.from_buffer(buffer, layout)
+
+
+class _EnforceCAIView:
+    def __init__(self, array):
+        self.array = array
+        self.__cuda_array_interface__ = array.__cuda_array_interface__
+
+
+def _get_ptr(array):
+    if isinstance(array, np.ndarray):
+        return array.ctypes.data
+    else:
+        assert isinstance(array, cp.ndarray)
+        return array.data.ptr
+
+
+@pytest.mark.parametrize(
+    ("shape", "slices", "stride_order", "view_as"),
+    [
+        (shape, slices, stride_order, view_as)
+        for shape, slices in [
+            ((5, 6), (2, slice(1, -1))),
+            ((10, 13, 11), (slice(None, None, 2), slice(None, None, -1), slice(2, -3))),
+        ]
+        for stride_order in ["C", "F"]
+        for view_as in ["dlpack", "cai"]
+    ],
+)
+def test_from_buffer_sliced_external(shape, slices, stride_order, view_as):
+    if view_as == "dlpack":
+        if np is None:
+            pytest.skip("NumPy is not installed")
+        a = np.arange(math.prod(shape), dtype=np.int32).reshape(shape, order=stride_order)
+        view = StridedMemoryView(a, -1)
+    else:
+        if cp is None:
+            pytest.skip("CuPy is not installed")
+        a = cp.arange(math.prod(shape), dtype=cp.int32).reshape(shape, order=stride_order)
+        view = StridedMemoryView(_EnforceCAIView(a), -1)
+    layout = view.layout
+    assert layout.is_dense
+    assert layout.required_size_in_bytes() == a.nbytes
+    assert view.ptr == _get_ptr(a)
+
+    sliced_layout = layout[slices]
+    sliced_view = view.view(sliced_layout)
+    a_sliced = a[slices]
+    assert sliced_view.ptr == _get_ptr(a_sliced)
+    assert sliced_view.ptr != view.ptr
+
+    assert 0 <= sliced_layout.required_size_in_bytes() <= a.nbytes
+    assert not sliced_layout.is_dense
+    assert sliced_view.layout is sliced_layout
+    assert view.dtype == sliced_view.dtype
+    assert sliced_view.layout.itemsize == a_sliced.itemsize == layout.itemsize
+    assert sliced_view.shape == a_sliced.shape
+    assert sliced_view.layout.strides_in_bytes == a_sliced.strides