From f17aabb5b3b4fc37504acd006190d00fb58df95e Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Wed, 25 Feb 2026 13:32:03 -0600
Subject: [PATCH 1/8] working on type instabilities

---
 examples/custom_cuda.jl       |  4 +--
 src/ndarray/detail/ndarray.jl | 23 ++++++------
 src/ndarray/ndarray.jl        | 68 ++++++++++++++++++++---------------
 3 files changed, 54 insertions(+), 41 deletions(-)

diff --git a/examples/custom_cuda.jl b/examples/custom_cuda.jl
index 7c6d8511..18675c4f 100644
--- a/examples/custom_cuda.jl
+++ b/examples/custom_cuda.jl
@@ -24,8 +24,8 @@ N = 1024
 threads = 256
 blocks = cld(N, threads)
 
-a = cuNumeric.full(N, 1.0f0)
-b = cuNumeric.full(N, 2.0f0)
+a = cuNumeric.fill(1.0f0, N)
+b = cuNumeric.fill(2.0f0, N)
 c = cuNumeric.ones(Float32, N)
 
 # task = cuNumeric.@cuda_task kernel_add(a, b, c, UInt32(1))
diff --git a/src/ndarray/detail/ndarray.jl b/src/ndarray/detail/ndarray.jl
index 5ecc10ea..f124bc80 100644
--- a/src/ndarray/detail/ndarray.jl
+++ b/src/ndarray/detail/ndarray.jl
@@ -77,28 +77,28 @@ end
 #     return NDArray(ptr, T = T, n_dim = 1)
 # end
 
-NDArray(value::T) where {T<:SUPPORTED_TYPES} = nda_full_array(UInt64[], value)
+NDArray(value::T) where {T<:SUPPORTED_TYPES} = nda_full_array((), value)
 
 # construction 
-function nda_zeros_array(shape::Vector{UInt64}, ::Type{T}) where {T}
-    n_dim = Int32(length(shape))
+function nda_zeros_array(dims::Dims{N}, ::Type{T}) where {T, N}
+    shape = collect(UInt64, dims)
     legate_type = Legate.to_legate_type(T)
     ptr = ccall((:nda_zeros_array, libnda),
         NDArray_t, (Int32, Ptr{UInt64}, Legate.LegateTypeAllocated),
-        n_dim, shape, legate_type)
-    return NDArray(ptr; T=T, n_dim=n_dim)
+        Int32(N), shape, legate_type)
+    return NDArray(ptr; T=T, n_dim=N)
 end
 
-function nda_full_array(shape::Vector{UInt64}, value::T) where {T}
-    n_dim = Int32(length(shape))
+function nda_full_array(dims::Dims{N}, value::T) where {T, N}
+    shape = collect(UInt64, dims)
     type = Legate.to_legate_type(T)
 
     ptr = ccall((:nda_full_array, libnda),
         NDArray_t,
         (Int32, Ptr{UInt64}, Legate.LegateTypeAllocated, Ptr{Cvoid}),
-        n_dim, shape, type, Ref(value))
+        Int32(N), shape, type, Ref(value))
 
-    return NDArray(ptr; T=T, n_dim=n_dim)
+    return NDArray(ptr; T=T, n_dim=N)
 end
 
 function nda_random(arr::NDArray, gen_code)
@@ -408,7 +408,10 @@ end
 
 Return the size of the given `NDArray`. This will include the padded size.
 """
-padded_shape(arr::NDArray) = Tuple(Int.(cuNumeric.nda_array_shape(arr)))
+function padded_shape(arr::NDArray{<:Any,N}) where {N}
+    shp = cuNumeric.nda_array_shape(arr) 
+    return ntuple(i -> Int(shp[i]), Val(N))
+end
 
 @doc"""
     shape(arr::NDArray)
diff --git a/src/ndarray/ndarray.jl b/src/ndarray/ndarray.jl
index 02ec68e5..f214eca6 100644
--- a/src/ndarray/ndarray.jl
+++ b/src/ndarray/ndarray.jl
@@ -220,7 +220,7 @@ size(arr)
 size(arr, 2)
 ```
 """
-Base.size(arr::NDArray) = cuNumeric.shape(arr)
+Base.size(arr::NDArray{<:Any, N}) where N = cuNumeric.shape(arr)
 Base.size(arr::NDArray, dim::Int) = Base.size(arr)[dim]
 
 @doc"""
@@ -253,12 +253,16 @@ Base.IndexStyle(::NDArray) = IndexCartesian()
 
 function Base.show(io::IO, arr::NDArray{T,0}) where {T}
     println(io, "0-dimensional NDArray{$(T),0}")
-    print(io, arr[]) #! should I assert scalar??
+    allowscalar() do
+        print(io, arr[])
+    end
 end
 
 function Base.show(io::IO, ::MIME"text/plain", arr::NDArray{T,0}) where {T}
     println(io, "0-dimensional NDArray{$(T),0}")
-    print(io, arr[]) #! should I assert scalar??
+    allowscalar() do
+        print(io, arr[])
+    end
 end
 
 function Base.show(io::IO, arr::NDArray{T,D}) where {T,D}
@@ -304,7 +308,7 @@ Assignment also supports:
 
 # Examples
 ```@repl
-A = cuNumeric.full((3, 3), 1.0);
+A = cuNumeric.fill(1.0, (3, 3));
 A[1, 2]
 A[1:2, 2:3] = cuNumeric.ones(2, 2);
 A[:, 1] = 5.0;
@@ -461,25 +465,27 @@ Base.fill!(arr::NDArray{T}, val::T) where {T} = nda_fill_array(arr, val)
 
 #### INITIALIZATION OF NDARRAYS ####
 @doc"""
-    cuNumeric.full(dims::Tuple, val)
-    cuNumeric.full(dim::Int, val)
+    cuNumeric.fill(val::T, dims::Dims)
+    cuNumeric.fill(val::T, dims::Int...)
 
 Create an `NDArray` filled with the scalar value `val`, with the shape specified by `dims`.
 
 # Examples
 ```@repl
-cuNumeric.full((2, 3), 7.5)
-cuNumeric.full(4, 0)
+cuNumeric.fill(7.5, (2, 3))
+cuNumeric.fill(0, 4)
 ```
 """
-function full(dims::Dims, val::T) where {T<:SUPPORTED_TYPES}
-    shape = collect(UInt64, dims)
-    return nda_full_array(shape, val)
+function fill(val::T, dims::Dims) where {T<:SUPPORTED_TYPES}
+    return nda_full_array(dims, val)
 end
 
-function full(dim::Int, val::T) where {T<:SUPPORTED_TYPES}
-    shape = UInt64[dim]
-    return nda_full_array(shape, val)
+function fill(val::T, dims::Int...) where {T<:SUPPORTED_TYPES}
+    return fill(val, dims)
+end
+
+function fill(val::T, dim::Int) where {T<:SUPPORTED_TYPES}
+    return fill(val, (dim,))
 end
 
 @doc"""
@@ -494,9 +500,9 @@ Create an `NDArray` filled with the true, with the shape specified by `dims`.
 cuNumeric.trues(2, 3)
 ```
 """
-trues(dim::Int) = cuNumeric.full(dim, true)
-trues(dims::Dims) = cuNumeric.full(dims, true)
-trues(dims::Int...) = cuNumeric.full(dims, true)
+trues(dim::Int) = cuNumeric.fill(true, dim)
+trues(dims::Dims) = cuNumeric.fill(true, dims)
+trues(dims::Int...) = cuNumeric.fill(true, dims)
 
 @doc"""
     cuNumeric.falses(dims::Tuple, val)
@@ -510,9 +516,10 @@ Create an `NDArray` filled with the false, with the shape specified by `dims`.
 cuNumeric.falses(2, 3)
 ```
 """
-falses(dim::Int) = cuNumeric.full(dim, false)
-falses(dims::Dims) = cuNumeric.full(dims, false)
-falses(dims::Int...) = cuNumeric.full(dims, false)
+falses(dims::Dims) = cuNumeric.fill(false, dims)
+falses(dims::Int...) = cuNumeric.fill(false, dims)
+falses(dim::Int) = cuNumeric.fill(false, dim)
+
 
 @doc"""
     cuNumeric.zeros([T=Float32,] dims::Int...)
@@ -528,9 +535,8 @@ cuNumeric.zeros(Float64, 3)
 cuNumeric.zeros(Int32, (2,3))
 ```
 """
-function zeros(::Type{T}, dims::Dims) where {T<:SUPPORTED_TYPES}
-    shape = collect(UInt64, dims)
-    return nda_zeros_array(shape, T)
+function zeros(::Type{T}, dims::Dims{N}) where {T<:SUPPORTED_TYPES, N}
+    return nda_zeros_array(dims, T)
 end
 
 function zeros(::Type{T}, dims::Int...) where {T<:SUPPORTED_TYPES}
@@ -546,15 +552,16 @@ function zeros(dims::Int...)
 end
 
 function zeros(::Type{T}) where {T}
-    return nda_zeros_array(UInt64[], T)
+    return nda_zeros_array((), T)
 end
 
 function zeros()
     return zeros(DEFAULT_FLOAT)
 end
 
-function zeros_like(arr::NDArray)
-    return zeros(eltype(arr), Base.size(arr))
+#* TYPE USNTABLE CAUSE SIZE IS RIGHT NOW
+function zeros_like(arr::NDArray{T,N}) where {T,N}
+    return zeros(T, Base.size(arr))
 end
 
 @doc"""
@@ -572,7 +579,7 @@ cuNumeric.ones(Int32, (2, 3))
 ```
 """
 function ones(::Type{T}, dims::Dims) where {T}
-    return full(dims, T(1))
+    return nda_full_array(dims, T(1))
 end
 
 function ones(::Type{T}, dims::Int...) where {T}
@@ -587,12 +594,13 @@ function ones(dims::Int...)
     return ones(DEFAULT_FLOAT, dims)
 end
 
+#* UNSTABLE
 function ones(::Type{T}) where {T}
-    return full((), T(1))
+    return cuNumeric.fill((), T(1))
 end
 
 function ones()
-    return zeros(DEFAULT_FLOAT)
+    return ones(DEFAULT_FLOAT)
 end
 
 @doc"""
@@ -645,11 +653,13 @@ reshape(arr, 12)
 ```
 """
 
+#*USNTABLE USE Val{false} IF WE REALLY WANT THIS FLAG
 function reshape(arr::NDArray, i::Dims{N}; copy::Bool=false) where {N}
     reshaped = nda_reshape_array(arr, UInt64.(collect(i)))
     return copy ? copy(reshaped) : reshaped
 end
 
+#*USNTABLE USE Val{false} IF WE REALLY WANT THIS FLAG
 function reshape(arr::NDArray, i::Int64; copy::Bool=false)
     reshaped = nda_reshape_array(arr, UInt64.([i]))
     return copy ? copy(reshaped) : reshaped

From eb0d844cbc932b2c0aba9a92a90d2fc275e06446 Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Wed, 25 Feb 2026 15:01:06 -0600
Subject: [PATCH 2/8] zeros, ones, fill type stable

---
 src/ndarray/binary.jl         | 16 ++++-----
 src/ndarray/detail/ndarray.jl | 62 +++++++++++++++--------------------
 src/ndarray/ndarray.jl        |  3 +-
 src/ndarray/unary.jl          | 18 +++++-----
 4 files changed, 44 insertions(+), 55 deletions(-)

diff --git a/src/ndarray/binary.jl b/src/ndarray/binary.jl
index 10974206..a8715240 100644
--- a/src/ndarray/binary.jl
+++ b/src/ndarray/binary.jl
@@ -87,7 +87,7 @@ function Base.:(-)(rhs1::NDArray{A,N}, rhs2::NDArray{B,N}) where {A,B,N}
     promote_shape(size(rhs1), size(rhs2))
     T_OUT = __checked_promote_op(-, A, B)
     out = cuNumeric.zeros(T_OUT, size(rhs1))
-    return nda_binary_op(
+    return nda_binary_op!(
         out,
         cuNumeric.SUBTRACT,
         unchecked_promote_arr(rhs1, T_OUT),
@@ -100,7 +100,7 @@ function Base.:(+)(rhs1::NDArray{A,N}, rhs2::NDArray{B,N}) where {A,B,N}
     promote_shape(size(rhs1), size(rhs2))
     T_OUT = __checked_promote_op(+, A, B)
     out = cuNumeric.zeros(T_OUT, size(rhs1))
-    return nda_binary_op(
+    return nda_binary_op!(
         out, cuNumeric.ADD, unchecked_promote_arr(rhs1, T_OUT), unchecked_promote_arr(rhs2, T_OUT)
     )
 end
@@ -108,7 +108,7 @@ end
 function Base.:(*)(val::V, arr::NDArray{A}) where {A,V}
     T = __my_promote_type(A, V)
     out = cuNumeric.zeros(T, size(arr))
-    return nda_binary_op(out, cuNumeric.MULTIPLY, NDArray(T(val)), unchecked_promote_arr(arr, T))
+    return nda_binary_op!(out, cuNumeric.MULTIPLY, NDArray(T(val)), unchecked_promote_arr(arr, T))
 end
 
 function Base.:(*)(arr::NDArray{A}, val::V) where {A,V}
@@ -191,7 +191,7 @@ for (julia_fn, op_code) in binary_op_map
         @inline function __broadcast(
             f::typeof($(julia_fn)), out::NDArray, rhs1::NDArray{T}, rhs2::NDArray{T}
         ) where {T}
-            return nda_binary_op(out, $(op_code), rhs1, rhs2)
+            return nda_binary_op!(out, $(op_code), rhs1, rhs2)
         end
     end
 end
@@ -204,7 +204,7 @@ for (julia_fn, op_code) in floaty_binary_op_map
         @inline function __broadcast(
             f::typeof($(julia_fn)), out::NDArray, rhs1::NDArray{T}, rhs2::NDArray{T}
         ) where {T}
-            return nda_binary_op(out, $(op_code), rhs1, rhs2)
+            return nda_binary_op!(out, $(op_code), rhs1, rhs2)
         end
 
         # If input is not already float, promote to that
@@ -220,7 +220,7 @@ end
     f::typeof(Base.:(+)), out::NDArray{O}, rhs1::NDArray{Bool}, rhs2::NDArray{Bool}
 ) where {O<:Integer}
     assertpromotion(".+", Bool, O)
-    return nda_binary_op(
+    return nda_binary_op!(
         out, cuNumeric.ADD, unchecked_promote_arr(rhs1, O), unchecked_promote_arr(rhs2, O)
     )
 end
@@ -229,7 +229,7 @@ end
     f::typeof(Base.:(-)), out::NDArray{O}, rhs1::NDArray{Bool}, rhs2::NDArray{Bool}
 ) where {O<:Integer}
     assertpromotion(".-", Bool, O)
-    return nda_binary_op(
+    return nda_binary_op!(
         out, cuNumeric.SUBTRACT, unchecked_promote_arr(rhs1, O), unchecked_promote_arr(rhs2, O)
     )
 end
@@ -250,7 +250,7 @@ end
 @inline function __broadcast(
     f::typeof(Base.literal_pow), out::NDArray, _, input::NDArray{T}, power::NDArray{T}
 ) where {T}
-    return nda_binary_op(out, cuNumeric.POWER, input, power)
+    return nda_binary_op!(out, cuNumeric.POWER, input, power)
 end
 
 # This is more "Julian" since a user expects map to broadcast
diff --git a/src/ndarray/detail/ndarray.jl b/src/ndarray/detail/ndarray.jl
index f124bc80..2dac42dd 100644
--- a/src/ndarray/detail/ndarray.jl
+++ b/src/ndarray/detail/ndarray.jl
@@ -28,21 +28,19 @@ get_n_dim(ptr::NDArray_t) = Int(ccall((:nda_array_dim, libnda), Int32, (NDArray_
 abstract type AbstractNDArray{T<:SUPPORTED_TYPES,N} end
 
 @doc"""
-**Internal API**
-
 The NDArray type represents a multi-dimensional array in cuNumeric.
 It is a wrapper around a Legate array and provides various methods for array manipulation and operations. 
 Finalizer calls `nda_destroy_array` to clean up the underlying Legate array when the NDArray is garbage collected.
 """
-mutable struct NDArray{T,N} <: AbstractNDArray{T,N}
+mutable struct NDArray{T, N, PADDED} <: AbstractNDArray{T,N}
     ptr::NDArray_t
     nbytes::Int64
-    padding::Union{Nothing,NTuple{N,Int}} where {N}
+    padding::Union{Nothing,NTuple{N,Int}}
 
-    function NDArray(ptr::NDArray_t; T=get_julia_type(ptr), n_dim=get_n_dim(ptr))
+    function NDArray(ptr::NDArray_t, ::Type{T}, ::Val{N}) where {T, N}
         nbytes = cuNumeric.nda_nbytes(ptr)
         cuNumeric.register_alloc!(nbytes)
-        handle = new{T,Int(n_dim)}(ptr, nbytes, nothing)
+        handle = new{T,N, false}(ptr, nbytes, nothing)
         finalizer(handle) do h
             cuNumeric.nda_destroy_array(h.ptr)
             cuNumeric.register_free!(h.nbytes)
@@ -51,6 +49,9 @@ mutable struct NDArray{T,N} <: AbstractNDArray{T,N}
     end
 end
 
+# Dynamic fallback, not great but required if we cannot infer things
+NDArray(ptr::NDArray_t) = NDArray(ptr, get_julia_type(ptr), Val(get_n_dim(ptr)))
+
 # struct WrappedNDArray{T,N} <: AbstractNDArray{T,N}
 #     ndarr::NDArray{T,N}
 #     jlarr::Array{T,N}
@@ -86,7 +87,7 @@ function nda_zeros_array(dims::Dims{N}, ::Type{T}) where {T, N}
     ptr = ccall((:nda_zeros_array, libnda),
         NDArray_t, (Int32, Ptr{UInt64}, Legate.LegateTypeAllocated),
         Int32(N), shape, legate_type)
-    return NDArray(ptr; T=T, n_dim=N)
+    return NDArray(ptr, T, Val(N))
 end
 
 function nda_full_array(dims::Dims{N}, value::T) where {T, N}
@@ -98,7 +99,7 @@ function nda_full_array(dims::Dims{N}, value::T) where {T, N}
         (Int32, Ptr{UInt64}, Legate.LegateTypeAllocated, Ptr{Cvoid}),
         Int32(N), shape, type, Ref(value))
 
-    return NDArray(ptr; T=T, n_dim=N)
+    return NDArray(ptr, T, Val(N))
 end
 
 function nda_random(arr::NDArray, gen_code)
@@ -112,14 +113,14 @@ function nda_random_array(shape::Vector{UInt64})
     ptr = ccall((:nda_random_array, libnda),
         NDArray_t, (Int32, Ptr{UInt64}),
         n_dim, shape)
-    return NDArray(ptr; n_dim=n_dim)
+    return NDArray(ptr, get_julia_type(ptr), Val(n_dim))
 end
 
 function nda_get_slice(arr::NDArray{T,N}, slices::Vector{Slice}) where {T,N}
     ptr = ccall((:nda_get_slice, libnda),
         NDArray_t, (NDArray_t, Ptr{Slice}, Cint),
         arr.ptr, pointer(slices), length(slices))
-    return NDArray(ptr; T=T, n_dim=N)
+    return NDArray(ptr, T, Val(N))
 end
 
 # queries
@@ -147,7 +148,7 @@ function nda_reshape_array(arr::NDArray{T}, newshape::Vector{UInt64}) where {T}
     ptr = ccall((:nda_reshape_array, libnda),
         NDArray_t, (NDArray_t, Int32, Ptr{UInt64}),
         arr.ptr, n_dim, newshape)
-    return NDArray(ptr; T=T, n_dim=n_dim)
+    return NDArray(ptr, T, Val(n_dim))
 end
 
 function nda_astype(arr::NDArray{OLD_T,N}, ::Type{NEW_T}) where {OLD_T,NEW_T,N}
@@ -156,7 +157,7 @@ function nda_astype(arr::NDArray{OLD_T,N}, ::Type{NEW_T}) where {OLD_T,NEW_T,N}
         NDArray_t,
         (NDArray_t, Legate.LegateTypeAllocated),
         arr.ptr, type)
-    return NDArray(ptr; T=NEW_T, n_dim=N)
+    return NDArray(ptr, NEW_T, Val(N))
 end
 
 function nda_fill_array(arr::NDArray{T}, value::T) where {T}
@@ -193,14 +194,14 @@ function nda_move(dst::NDArray{T,N}, src::NDArray{T,N}) where {T,N}
 end
 
 # operations 
-function nda_binary_op(out::NDArray, op_code::BinaryOpCode, rhs1::NDArray, rhs2::NDArray)
+function nda_binary_op!(out::NDArray, op_code::BinaryOpCode, rhs1::NDArray, rhs2::NDArray)
     ccall((:nda_binary_op, libnda),
         Cvoid, (NDArray_t, BinaryOpCode, NDArray_t, NDArray_t),
         out.ptr, op_code, rhs1.ptr, rhs2.ptr)
     return out
 end
 
-function nda_unary_op(out::NDArray, op_code::UnaryOpCode, input::NDArray)
+function nda_unary_op!(out::NDArray, op_code::UnaryOpCode, input::NDArray)
     ccall((:nda_unary_op, libnda),
         Cvoid, (NDArray_t, UnaryOpCode, NDArray_t),
         out.ptr, op_code, input.ptr)
@@ -218,7 +219,7 @@ function nda_array_equal(rhs1::NDArray{T,N}, rhs2::NDArray{T,N}) where {T,N}
     ptr = ccall((:nda_array_equal, libnda),
         NDArray_t, (NDArray_t, NDArray_t),
         rhs1.ptr, rhs2.ptr)
-    return NDArray(ptr; T=Bool, n_dim=1)
+    return NDArray(ptr, Bool, Val(1))
 end
 
 function nda_diag(arr::NDArray, k::Int32)
@@ -255,7 +256,7 @@ function nda_multiply_scalar(rhs1::NDArray{T,N}, value::T) where {T,N}
     ptr = ccall((:nda_multiply_scalar, libnda),
         NDArray_t, (NDArray_t, Legate.LegateTypeAllocated, Ptr{Cvoid}),
         rhs1.ptr, type, Ref(value))
-    return NDArray(ptr; T=T, n_dim=N)
+    return NDArray(ptr, T, Val(N))
 end
 
 function nda_add_scalar(rhs1::NDArray{T,N}, value::T) where {T,N}
@@ -264,7 +265,7 @@ function nda_add_scalar(rhs1::NDArray{T,N}, value::T) where {T,N}
     ptr = ccall((:nda_add_scalar, libnda),
         NDArray_t, (NDArray_t, Legate.LegateTypeAllocated, Ptr{Cvoid}),
         rhs1.ptr, type, Ref(value))
-    return NDArray(ptr; T=T, n_dim=N)
+    return NDArray(ptr, T, Val(N))
 end
 
 function nda_three_dot_arg(rhs1::NDArray{T}, rhs2::NDArray{T}, out::NDArray{T}) where {T}
@@ -286,7 +287,7 @@ function nda_eye(rows::Int32, ::Type{T}) where {T}
     ptr = ccall((:nda_eye, libnda),
         NDArray_t, (Int32, Legate.LegateTypeAllocated),
         rows, legate_type)
-    return NDArray(ptr; T=T, n_dim=2)
+    return NDArray(ptr, T, Val(2))
 end
 
 function nda_trace(
@@ -297,7 +298,7 @@ function nda_trace(
         NDArray_t,
         (NDArray_t, Int32, Int32, Int32, Legate.LegateTypeAllocated),
         arr.ptr, offset, a1, a2, legate_type)
-    return NDArray(ptr; T=T, n_dim=1)
+    return NDArray(ptr, T, Val(1))
 end
 
 function nda_transpose(arr::NDArray)
@@ -317,7 +318,7 @@ function nda_attach_external(arr::AbstractArray{T,N}) where {T,N}
         NDArray_t, (Ptr{Cvoid}, UInt64, Int32, Ptr{UInt64}, Legate.LegateTypeAllocated),
         ptr, nbytes, N, shape, legate_type)
 
-    return NDArray(nda_ptr; T=T, n_dim=N)
+    return NDArray(nda_ptr, T, Val(N))
 end
 
 # return underlying logical store to the NDArray obj
@@ -401,17 +402,6 @@ function slice_array(slices::Vararg{Tuple{Union{Int,Nothing},Union{Int,Nothing}}
     return v
 end
 
-@doc"""
-    padded_shape(arr::NDArray)
-
-**Internal API**
-
-Return the size of the given `NDArray`. This will include the padded size.
-"""
-function padded_shape(arr::NDArray{<:Any,N}) where {N}
-    shp = cuNumeric.nda_array_shape(arr) 
-    return ntuple(i -> Int(shp[i]), Val(N))
-end
 
 @doc"""
     shape(arr::NDArray)
@@ -420,11 +410,11 @@ end
 
 Return the size of the given `NDArray`.
 """
-function shape(arr::NDArray)
-    if !isnothing(arr.padding)
-        return arr.padding
-    end
-    return cuNumeric.padded_shape(arr)
+shape(arr::NDArray{<:Any, N, true}) where N = arr.padding
+
+function shape(arr::NDArray{<:Any, N, false}) where {N}
+    shp = cuNumeric.nda_array_shape(arr) 
+    return ntuple(i -> Int(shp[i]), Val(N))
 end
 
 @doc"""
diff --git a/src/ndarray/ndarray.jl b/src/ndarray/ndarray.jl
index f214eca6..5c8b5b6e 100644
--- a/src/ndarray/ndarray.jl
+++ b/src/ndarray/ndarray.jl
@@ -594,9 +594,8 @@ function ones(dims::Int...)
     return ones(DEFAULT_FLOAT, dims)
 end
 
-#* UNSTABLE
 function ones(::Type{T}) where {T}
-    return cuNumeric.fill((), T(1))
+    return cuNumeric.fill(T(1), ())
 end
 
 function ones()
diff --git a/src/ndarray/unary.jl b/src/ndarray/unary.jl
index 4158b010..3b12e73d 100644
--- a/src/ndarray/unary.jl
+++ b/src/ndarray/unary.jl
@@ -101,13 +101,13 @@ global const unary_op_map_no_args = Dict{Function,UnaryOpCode}(
 ### SPECIAL CASES ###
 
 # Needed to support != 
-Base.:(!)(input::NDArray{Bool,0}) = nda_unary_op(similar(input), cuNumeric.LOGICAL_NOT, input)
-Base.:(!)(input::NDArray{Bool,1}) = nda_unary_op(similar(input), cuNumeric.LOGICAL_NOT, input)
+Base.:(!)(input::NDArray{Bool,0}) = nda_unary_op!(similar(input), cuNumeric.LOGICAL_NOT, input)
+Base.:(!)(input::NDArray{Bool,1}) = nda_unary_op!(similar(input), cuNumeric.LOGICAL_NOT, input)
 
 # Non-broadcasted version of negation
 function Base.:(-)(input::NDArray{T}) where {T}
     out = cuNumeric.zeros(T, size(input))
-    return nda_unary_op(out, cuNumeric.NEGATIVE, input)
+    return nda_unary_op!(out, cuNumeric.NEGATIVE, input)
 end
 
 function Base.:(-)(input::NDArray{Bool})
@@ -121,7 +121,7 @@ end
 @inline function __broadcast(
     f::typeof(Base.literal_pow), out::NDArray{O}, _, input::NDArray{T}, ::Type{Val{2}}
 ) where {T,O}
-    return nda_unary_op(out, cuNumeric.SQUARE, input)
+    return nda_unary_op!(out, cuNumeric.SQUARE, input)
 end
 
 @inline function __broadcast(
@@ -129,13 +129,13 @@ end
 ) where {O}
     nda_move(out, O(1) ./ checked_promote_arr(input, O)) #! REPLACE WITH RECIP ONCE FIXED
     return out
-    # return nda_unary_op(out, cuNumeric.RECIPROCAL, input)
+    # return nda_unary_op!(out, cuNumeric.RECIPROCAL, input)
 end
 
 @inline function __broadcast(::typeof(Base.inv), out::NDArray{O}, input::NDArray) where {O}
     nda_move(out, O(1) ./ checked_promote_arr(input, O)) #! REPLACE WITH RECIP ONCE FIXED
     return out
-    # return nda_unary_op(out, cuNumeric.RECIPROCAL, checked_promote_arr(input,O))
+    # return nda_unary_op!(out, cuNumeric.RECIPROCAL, checked_promote_arr(input,O))
 end
 
 #! NEEDS TO SUPPORT inv and ^ -1
@@ -150,7 +150,7 @@ end
 
 # Only supported for Bools
 @inline function __broadcast(f::typeof(Base.:(!)), out::NDArray{Bool}, input::NDArray{Bool})
-    return nda_unary_op(out, cuNumeric.LOGICAL_NOT, input)
+    return nda_unary_op!(out, cuNumeric.LOGICAL_NOT, input)
 end
 
 # Generate hidden broadcasted version of unary ops.
@@ -159,7 +159,7 @@ for (julia_fn, op_code) in unary_op_map_no_args
         @inline function __broadcast(
             f::typeof($julia_fn), out::NDArray{T}, input::NDArray{T}
         ) where {T}
-            return nda_unary_op(out, $(op_code), input)
+            return nda_unary_op!(out, $(op_code), input)
         end
     end
 end
@@ -172,7 +172,7 @@ for (julia_fn, op_code) in floaty_unary_ops_no_args
         @inline function __broadcast(
             f::typeof($julia_fn), out::NDArray{T}, input::NDArray{T}
         ) where {T}
-            return nda_unary_op(out, $(op_code), input)
+            return nda_unary_op!(out, $(op_code), input)
         end
 
         # If input is not already float, promote to that

From 6e7c36afcfef15f4aedb5f571a997a2c9416dec3 Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Wed, 25 Feb 2026 15:15:09 -0600
Subject: [PATCH 3/8] start rand

---
 src/ndarray/detail/ndarray.jl | 10 +++++-----
 src/ndarray/ndarray.jl        |  5 ++---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/ndarray/detail/ndarray.jl b/src/ndarray/detail/ndarray.jl
index 2dac42dd..7e216fcb 100644
--- a/src/ndarray/detail/ndarray.jl
+++ b/src/ndarray/detail/ndarray.jl
@@ -50,7 +50,7 @@ mutable struct NDArray{T, N, PADDED} <: AbstractNDArray{T,N}
 end
 
 # Dynamic fallback, not great but required if we cannot infer things
-NDArray(ptr::NDArray_t) = NDArray(ptr, get_julia_type(ptr), Val(get_n_dim(ptr)))
+NDArray(ptr::NDArray_t; T = get_julia_type(ptr), N::Integer = get_n_dim(ptr)) = NDArray(ptr, T, Val(N))
 
 # struct WrappedNDArray{T,N} <: AbstractNDArray{T,N}
 #     ndarr::NDArray{T,N}
@@ -108,12 +108,12 @@ function nda_random(arr::NDArray, gen_code)
         arr.ptr, Int32(gen_code))
 end
 
-function nda_random_array(shape::Vector{UInt64})
-    n_dim = Int32(length(shape))
+function nda_random_array(dims::Dims{N}) where {N}
+    shape = collect(UInt64, dims)
     ptr = ccall((:nda_random_array, libnda),
         NDArray_t, (Int32, Ptr{UInt64}),
-        n_dim, shape)
-    return NDArray(ptr, get_julia_type(ptr), Val(n_dim))
+        Int32(N), shape)
+    return NDArray(ptr, Float64, Val(N)) #* T is always Float64 cause of cupynumeric
 end
 
 function nda_get_slice(arr::NDArray{T,N}, slices::Vector{Slice}) where {T,N}
diff --git a/src/ndarray/ndarray.jl b/src/ndarray/ndarray.jl
index 5c8b5b6e..71337c37 100644
--- a/src/ndarray/ndarray.jl
+++ b/src/ndarray/ndarray.jl
@@ -624,14 +624,13 @@ A = cuNumeric.zeros(2, 2); cuNumeric.rand!(A)
 ```
 """
 Random.rand!(arr::NDArray{Float64}) = cuNumeric.nda_random(arr, 0)
-rand(::Type{NDArray}, dims::Dims) = cuNumeric.nda_random_array(UInt64.(collect(dims)))
+rand(::Type{NDArray}, dims::Dims) = cuNumeric.nda_random_array(dims)
 rand(::Type{NDArray}, dims::Int...) = cuNumeric.rand(NDArray, dims)
 rand(dims::Dims) = cuNumeric.rand(NDArray, dims)
 rand(dims::Int...) = cuNumeric.rand(NDArray, dims)
 
 function rand(::Type{T}, dims::Dims) where {T<:AbstractFloat}
-    arrfp64 = cuNumeric.nda_random_array(UInt64.(collect(dims)))
-    # if T == Float64, as_type should do minimial work # TODO check this.
+    arrfp64 = cuNumeric.nda_random_array(dims)
     return cuNumeric.as_type(arrfp64, T)
 end
 

From 42cdb1727ee3a2ba033a9c68f2d032c434d0e7ee Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Wed, 25 Feb 2026 16:45:55 -0600
Subject: [PATCH 4/8] rand/rand! and most broadcasting stable

---
 src/ndarray/ndarray.jl  |  1 -
 test/tests/stability.jl | 23 +++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 test/tests/stability.jl

diff --git a/src/ndarray/ndarray.jl b/src/ndarray/ndarray.jl
index 71337c37..53f2a5a5 100644
--- a/src/ndarray/ndarray.jl
+++ b/src/ndarray/ndarray.jl
@@ -559,7 +559,6 @@ function zeros()
     return zeros(DEFAULT_FLOAT)
 end
 
-#* TYPE USNTABLE CAUSE SIZE IS RIGHT NOW
 function zeros_like(arr::NDArray{T,N}) where {T,N}
     return zeros(T, Base.size(arr))
 end
diff --git a/test/tests/stability.jl b/test/tests/stability.jl
new file mode 100644
index 00000000..5ef3d3fb
--- /dev/null
+++ b/test/tests/stability.jl
@@ -0,0 +1,23 @@
+@testset "Stability" begin
+
+    @testset "core" begin
+        # size, shape, NDArray constructor
+    end
+
+    @testset "construction" begin
+        # zeros, zeros_like, ones, rand, fill, trues, falses
+    end
+
+    @testset "indexing" begin
+        # getindex, setindex!, copy, copyto!, fill!, as_type
+    end
+
+    @testset "arithmetic" begin
+        # +, -, *, /, ^, %, &, |, ⊻
+    end
+
+    @testset "linear algebra" begin
+        # mul!, dot, norm, det, inv, pinv, eig, svd, lu, qr, cholesky
+    end
+
+end
\ No newline at end of file

From cd4da3739fa45a61a88fb4e8a5f1aad29a20b0d8 Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Mon, 2 Mar 2026 14:38:37 -0600
Subject: [PATCH 5/8] add tests for type stability

---
 src/ndarray/detail/ndarray.jl |  8 ++--
 src/ndarray/ndarray.jl        |  7 ++--
 test/runtests.jl              |  4 ++
 test/tests/stability.jl       | 73 +++++++++++++++++++++++++++++++----
 4 files changed, 77 insertions(+), 15 deletions(-)

diff --git a/src/ndarray/detail/ndarray.jl b/src/ndarray/detail/ndarray.jl
index 7e216fcb..7d0c8c2b 100644
--- a/src/ndarray/detail/ndarray.jl
+++ b/src/ndarray/detail/ndarray.jl
@@ -143,12 +143,12 @@ function nda_array_shape(arr::NDArray)
 end
 
 # modify
-function nda_reshape_array(arr::NDArray{T}, newshape::Vector{UInt64}) where {T}
-    n_dim = Int32(length(newshape))
+function nda_reshape_array(arr::NDArray{T}, newdims::Dims{N}) where {T, N}
+    newshape = collect(UInt64, newdims)
     ptr = ccall((:nda_reshape_array, libnda),
         NDArray_t, (NDArray_t, Int32, Ptr{UInt64}),
-        arr.ptr, n_dim, newshape)
-    return NDArray(ptr, T, Val(n_dim))
+        arr.ptr, Int32(N), newshape)
+    return NDArray(ptr, T, Val(N))
 end
 
 function nda_astype(arr::NDArray{OLD_T,N}, ::Type{NEW_T}) where {OLD_T,NEW_T,N}
diff --git a/src/ndarray/ndarray.jl b/src/ndarray/ndarray.jl
index 53f2a5a5..9993d903 100644
--- a/src/ndarray/ndarray.jl
+++ b/src/ndarray/ndarray.jl
@@ -652,14 +652,13 @@ reshape(arr, 12)
 
 #*USNTABLE USE Val{false} IF WE REALLY WANT THIS FLAG
 function reshape(arr::NDArray, i::Dims{N}; copy::Bool=false) where {N}
-    reshaped = nda_reshape_array(arr, UInt64.(collect(i)))
+    reshaped = nda_reshape_array(arr, i)
     return copy ? copy(reshaped) : reshaped
 end
 
 #*USNTABLE USE Val{false} IF WE REALLY WANT THIS FLAG
-function reshape(arr::NDArray, i::Int64; copy::Bool=false)
-    reshaped = nda_reshape_array(arr, UInt64.([i]))
-    return copy ? copy(reshaped) : reshaped
+function reshape(arr::NDArray, i::Int...; copy::Bool=false)
+    return reshape(arr, i; copy = copy)
 end
 
 # Ignore the scalar indexing here...
diff --git a/test/runtests.jl b/test/runtests.jl
index 60137c4c..4df77506 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -380,6 +380,10 @@ end
     end
 end
 
+@testset verbose = true "Type Stability" begin
+    include("tests/stability.jl")
+end
+
 @testset verbose = true "Scoping" begin
     N = 100
 
diff --git a/test/tests/stability.jl b/test/tests/stability.jl
index 5ef3d3fb..010e251b 100644
--- a/test/tests/stability.jl
+++ b/test/tests/stability.jl
@@ -1,23 +1,82 @@
 @testset "Stability" begin
 
     @testset "core" begin
-        # size, shape, NDArray constructor
+        a = cuNumeric.zeros(5)
+        b = cuNumeric.zeros(Float64, 3, 4)
+        @inferred size(a)
+        @inferred size(b)
+        @inferred cuNumeric.shape(a)
+        @inferred cuNumeric.shape(b)
     end
 
     @testset "construction" begin
-        # zeros, zeros_like, ones, rand, fill, trues, falses
+        # zeros, zeros_like, ones, rand, fill, trues, falses\
+        for constructor in (:zeros, :ones)
+            @eval begin
+                @inferred cuNumeric.$(constructor)(Float64, 3, 2)
+                @inferred cuNumeric.$(constructor)(Float64, (3, 4))
+                @inferred cuNumeric.$(constructor)(3, 5, 6)
+                @inferred cuNumeric.$(constructor)((3,))
+                @inferred cuNumeric.$(constructor)()
+                @inferred cuNumeric.$(constructor)(Int64)
+            end
+        end
+        a = cuNumeric.zeros(Float64, 5, 3)
+        @inferred cuNumeric.zeros_like(a)
+
+        for constructor in (:trues, :falses)
+            @eval begin
+                @inferred cuNumeric.$(constructor)(5)
+                @inferred cuNumeric.$(constructor)((5,4))
+                @inferred cuNumeric.$(constructor)(3, 4, 5)
+            end
+        end
+
+        @inferred cuNumeric.fill(2.0, 3, 4)
+        @inferred cuNumeric.fill(2, (3, 4))
+        @inferred cuNumeric.fill(2.0, 3)
+
+        @inferred cuNumeric.rand(4, 3)
+        @inferred cuNumeric.rand(Float32, 5)
+    end
+
+    @testset "conversion" begin
+        # cast to array, as_type
+        a = cuNumeric.zeros(Float64, 5, 5)
+        @inferred Array(a)
+        @inferred Array{Float32}(a)
+        @inferred cuNumeric.as_type(a, Float32)
+        @inferred cuNumeric.as_type(a, Int64)
     end
 
     @testset "indexing" begin
         # getindex, setindex!, copy, copyto!, fill!, as_type
-    end
+        a = cuNumeric.zeros(Float32, 5, 5)
+        b = cuNumeric.zeros(Int32, 11)
 
-    @testset "arithmetic" begin
-        # +, -, *, /, ^, %, &, |, ⊻
+        @inferred a[1:3, 1:3]
+        @inferred a[2, 1:3]
+        @inferred a[1, 1:3] .+ b[1:3]
+        @inferred b[1:5]
+        # @inferred a[1:3, 1:end]
+        allowscalar() do
+            @inferred a[1, 2]
+        end
     end
 
-    @testset "linear algebra" begin
-        # mul!, dot, norm, det, inv, pinv, eig, svd, lu, qr, cholesky
+    @testset "broadcasting" begin
+        a = cuNumeric.ones(Float32, 3, 3)
+        b = cuNumeric.ones(Int32, 3, 3)
+        @inferred 5 .* a
+        @inferred 5.0f0 .* a
+        @inferred 5 * a
+        @inferred 5.0f0 * a
+
+        @inferred a .* b
+        @inferred a .+ b
+        @inferred a ./ b
+        @inferred ((a .* b) .+ a) .* 2.0f0
     end
 
+
 end
\ No newline at end of file

From 4f90b13ec5f5a47c59d24efde24b5fabe8bb71bb Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Mon, 2 Mar 2026 14:42:15 -0600
Subject: [PATCH 6/8] flatten stability tests (drop outer "Stability" testset)

---
 test/tests/stability.jl | 131 +++++++++++++++++++---------------------
 1 file changed, 63 insertions(+), 68 deletions(-)

diff --git a/test/tests/stability.jl b/test/tests/stability.jl
index 010e251b..137ac9e9 100644
--- a/test/tests/stability.jl
+++ b/test/tests/stability.jl
@@ -1,82 +1,77 @@
-@testset "Stability" begin
+@testset "core" begin
+    a = cuNumeric.zeros(5)
+    b = cuNumeric.zeros(Float64, 3, 4)
+    @inferred size(a)
+    @inferred size(b)
+    @inferred cuNumeric.shape(a)
+    @inferred cuNumeric.shape(b)
+end
 
-    @testset "core" begin
-        a = cuNumeric.zeros(5)
-        b = cuNumeric.zeros(Float64, 3, 4)
-        @inferred size(a)
-        @inferred size(b)
-        @inferred cuNumeric.shape(a)
-        @inferred cuNumeric.shape(b)
-    end
-
-    @testset "construction" begin
-        # zeros, zeros_like, ones, rand, fill, trues, falses\
-        for constructor in (:zeros, :ones)
-            @eval begin
-                @inferred cuNumeric.$(constructor)(Float64, 3, 2)
-                @inferred cuNumeric.$(constructor)(Float64, (3, 4))
-                @inferred cuNumeric.$(constructor)(3, 5, 6)
-                @inferred cuNumeric.$(constructor)((3,))
-                @inferred cuNumeric.$(constructor)()
-                @inferred cuNumeric.$(constructor)(Int64)
-            end
+@testset "construction" begin
+    # zeros, zeros_like, ones, rand, fill, trues, falses\
+    for constructor in (:zeros, :ones)
+        @eval begin
+            @inferred cuNumeric.$(constructor)(Float64, 3, 2)
+            @inferred cuNumeric.$(constructor)(Float64, (3, 4))
+            @inferred cuNumeric.$(constructor)(3, 5, 6)
+            @inferred cuNumeric.$(constructor)((3,))
+            @inferred cuNumeric.$(constructor)()
+            @inferred cuNumeric.$(constructor)(Int64)
         end
-        a = cuNumeric.zeros(Float64, 5, 3)
-        @inferred cuNumeric.zeros_like(a)
+    end
+    a = cuNumeric.zeros(Float64, 5, 3)
+    @inferred cuNumeric.zeros_like(a)
 
-        for constructor in (:trues, :falses)
-            @eval begin
-                @inferred cuNumeric.$(constructor)(5)
-                @inferred cuNumeric.$(constructor)((5,4))
-                @inferred cuNumeric.$(constructor)(3, 4, 5)
-            end
+    for constructor in (:trues, :falses)
+        @eval begin
+            @inferred cuNumeric.$(constructor)(5)
+            @inferred cuNumeric.$(constructor)((5,4))
+            @inferred cuNumeric.$(constructor)(3, 4, 5)
         end
-
-        @inferred cuNumeric.fill(2.0, 3, 4)
-        @inferred cuNumeric.fill(2, (3, 4))
-        @inferred cuNumeric.fill(2.0, 3)
-
-        @inferred cuNumeric.rand(4, 3)
-        @inferred cuNumeric.rand(Float32, 5)
     end
 
-    @testset "conversion" begin
-        # cast to array, as_type
-        a = cuNumeric.zeros(Float64, 5, 5)
-        @inferred Array(a)
-        @inferred Array{Float32}(a)
-        @inferred cuNumeric.as_type(a, Float32)
-        @inferred cuNumeric.as_type(a, Int64)
-    end
+    @inferred cuNumeric.fill(2.0, 3, 4)
+    @inferred cuNumeric.fill(2, (3, 4))
+    @inferred cuNumeric.fill(2.0, 3)
 
-    @testset "indexing" begin
-        # getindex, setindex!, copy, copyto!, fill!, as_type
-        a = cuNumeric.zeros(Float32, 5, 5)
-        b = cuNumeric.zeros(Int32, 11)
+    @inferred cuNumeric.rand(4, 3)
+    @inferred cuNumeric.rand(Float32, 5)
+end
 
-        @inferred a[1:3, 1:3]
-        @inferred a[2, 1:3]
-        @inferred a[1, 1:3] .+ b[1:3]
-        @inferred b[1:5]
-        # @inferred a[1:3, 1:end]
-        allowscalar() do
-            @inferred a[1, 2]
-        end
-    end
+@testset "conversion" begin
+    # cast to array, as_type
+    a = cuNumeric.zeros(Float64, 5, 5)
+    @inferred Array(a)
+    @inferred Array{Float32}(a)
+    @inferred cuNumeric.as_type(a, Float32)
+    @inferred cuNumeric.as_type(a, Int64)
+end
 
-    @testset "broadcasting" begin
-        a = cuNumeric.ones(Float32, 3, 3)
-        b = cuNumeric.ones(Int32, 3, 3)
-        @inferred 5 .* a
-        @inferred 5.0f0 .* a
-        @inferred 5 * a
-        @inferred 5.0f0 * a
+@testset "indexing" begin
+    # getindex, setindex!, copy, copyto!, fill!, as_type
+    a = cuNumeric.zeros(Float32, 5, 5)
+    b = cuNumeric.zeros(Int32, 11)
 
-        @inferred a .* b
-        @inferred a .+ b
-        @inferred a ./ b
-        @inferred ((a .* b) .+ a) .* 2.0f0
+    @inferred a[1:3, 1:3]
+    @inferred a[2, 1:3]
+    @inferred a[1, 1:3] .+ b[1:3]
+    @inferred b[1:5]
+    # @inferred a[1:3, 1:end]
+    allowscalar() do
+        @inferred a[1, 2]
     end
+end
 
+@testset "broadcasting" begin
+    a = cuNumeric.ones(Float32, 3, 3)
+    b = cuNumeric.ones(Int32, 3, 3)
+    @inferred 5 .* a
+    @inferred 5.0f0 .* a
+    @inferred 5 * a
+    @inferred 5.0f0 * a
 
+    @inferred a .* b
+    @inferred a .+ b
+    @inferred a ./ b
+    @inferred ((a .* b) .+ a) .* 2.0f0
 end
\ No newline at end of file

From d95f968e7a702c53be212c22fed51c5c865c58dc Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Mon, 2 Mar 2026 14:50:36 -0600
Subject: [PATCH 7/8] make all stability testsets verbose

---
 test/tests/stability.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/tests/stability.jl b/test/tests/stability.jl
index 137ac9e9..d097454a 100644
--- a/test/tests/stability.jl
+++ b/test/tests/stability.jl
@@ -1,4 +1,4 @@
-@testset "core" begin
+@testset verbose = true "core" begin
     a = cuNumeric.zeros(5)
     b = cuNumeric.zeros(Float64, 3, 4)
     @inferred size(a)
@@ -7,7 +7,7 @@
     @inferred cuNumeric.shape(b)
 end
 
-@testset "construction" begin
+@testset verbose = true "construction" begin
     # zeros, zeros_like, ones, rand, fill, trues, falses\
     for constructor in (:zeros, :ones)
         @eval begin
@@ -38,7 +38,7 @@ end
     @inferred cuNumeric.rand(Float32, 5)
 end
 
-@testset "conversion" begin
+@testset verbose = true "conversion" begin
     # cast to array, as_type
     a = cuNumeric.zeros(Float64, 5, 5)
     @inferred Array(a)
@@ -47,7 +47,7 @@ end
     @inferred cuNumeric.as_type(a, Int64)
 end
 
-@testset "indexing" begin
+@testset verbose = true "indexing" begin
     # getindex, setindex!, copy, copyto!, fill!, as_type
     a = cuNumeric.zeros(Float32, 5, 5)
     b = cuNumeric.zeros(Int32, 11)
@@ -62,7 +62,7 @@ end
     end
 end
 
-@testset "broadcasting" begin
+@testset verbose = true "broadcasting" begin
     a = cuNumeric.ones(Float32, 3, 3)
     b = cuNumeric.ones(Int32, 3, 3)
     @inferred 5 .* a

From 5b962417d801fd12f3137c49dcb540073fea4dcc Mon Sep 17 00:00:00 2001
From: ejmeitz <emeitz@andrew.cmu.edu>
Date: Mon, 2 Mar 2026 14:59:18 -0600
Subject: [PATCH 8/8] fix ND printing bug

---
 src/ndarray/ndarray.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ndarray/ndarray.jl b/src/ndarray/ndarray.jl
index 9993d903..b703ceb9 100644
--- a/src/ndarray/ndarray.jl
+++ b/src/ndarray/ndarray.jl
@@ -267,7 +267,7 @@ end
 
 function Base.show(io::IO, arr::NDArray{T,D}) where {T,D}
     println(io, "NDArray{$(T),$(D)}")
-    Base.print_matrix(io, Array(arr))
+    Base.print_array(io, Array(arr))
 end
 
 function Base.show(io::IO, ::MIME"text/plain", arr::NDArray{T}) where {T}