diff --git a/examples/custom_cuda.jl b/examples/custom_cuda.jl index 7c6d8511..18675c4f 100644 --- a/examples/custom_cuda.jl +++ b/examples/custom_cuda.jl @@ -24,8 +24,8 @@ N = 1024 threads = 256 blocks = cld(N, threads) -a = cuNumeric.full(N, 1.0f0) -b = cuNumeric.full(N, 2.0f0) +a = cuNumeric.fill(1.0f0, N) +b = cuNumeric.fill(2.0f0, N) c = cuNumeric.ones(Float32, N) # task = cuNumeric.@cuda_task kernel_add(a, b, c, UInt32(1)) diff --git a/src/ndarray/binary.jl b/src/ndarray/binary.jl index 10974206..a8715240 100644 --- a/src/ndarray/binary.jl +++ b/src/ndarray/binary.jl @@ -87,7 +87,7 @@ function Base.:(-)(rhs1::NDArray{A,N}, rhs2::NDArray{B,N}) where {A,B,N} promote_shape(size(rhs1), size(rhs2)) T_OUT = __checked_promote_op(-, A, B) out = cuNumeric.zeros(T_OUT, size(rhs1)) - return nda_binary_op( + return nda_binary_op!( out, cuNumeric.SUBTRACT, unchecked_promote_arr(rhs1, T_OUT), @@ -100,7 +100,7 @@ function Base.:(+)(rhs1::NDArray{A,N}, rhs2::NDArray{B,N}) where {A,B,N} promote_shape(size(rhs1), size(rhs2)) T_OUT = __checked_promote_op(+, A, B) out = cuNumeric.zeros(T_OUT, size(rhs1)) - return nda_binary_op( + return nda_binary_op!( out, cuNumeric.ADD, unchecked_promote_arr(rhs1, T_OUT), unchecked_promote_arr(rhs2, T_OUT) ) end @@ -108,7 +108,7 @@ end function Base.:(*)(val::V, arr::NDArray{A}) where {A,V} T = __my_promote_type(A, V) out = cuNumeric.zeros(T, size(arr)) - return nda_binary_op(out, cuNumeric.MULTIPLY, NDArray(T(val)), unchecked_promote_arr(arr, T)) + return nda_binary_op!(out, cuNumeric.MULTIPLY, NDArray(T(val)), unchecked_promote_arr(arr, T)) end function Base.:(*)(arr::NDArray{A}, val::V) where {A,V} @@ -191,7 +191,7 @@ for (julia_fn, op_code) in binary_op_map @inline function __broadcast( f::typeof($(julia_fn)), out::NDArray, rhs1::NDArray{T}, rhs2::NDArray{T} ) where {T} - return nda_binary_op(out, $(op_code), rhs1, rhs2) + return nda_binary_op!(out, $(op_code), rhs1, rhs2) end end end @@ -204,7 +204,7 @@ for (julia_fn, op_code) in 
floaty_binary_op_map @inline function __broadcast( f::typeof($(julia_fn)), out::NDArray, rhs1::NDArray{T}, rhs2::NDArray{T} ) where {T} - return nda_binary_op(out, $(op_code), rhs1, rhs2) + return nda_binary_op!(out, $(op_code), rhs1, rhs2) end # If input is not already float, promote to that @@ -220,7 +220,7 @@ end f::typeof(Base.:(+)), out::NDArray{O}, rhs1::NDArray{Bool}, rhs2::NDArray{Bool} ) where {O<:Integer} assertpromotion(".+", Bool, O) - return nda_binary_op( + return nda_binary_op!( out, cuNumeric.ADD, unchecked_promote_arr(rhs1, O), unchecked_promote_arr(rhs2, O) ) end @@ -229,7 +229,7 @@ end f::typeof(Base.:(-)), out::NDArray{O}, rhs1::NDArray{Bool}, rhs2::NDArray{Bool} ) where {O<:Integer} assertpromotion(".-", Bool, O) - return nda_binary_op( + return nda_binary_op!( out, cuNumeric.SUBTRACT, unchecked_promote_arr(rhs1, O), unchecked_promote_arr(rhs2, O) ) end @@ -250,7 +250,7 @@ end @inline function __broadcast( f::typeof(Base.literal_pow), out::NDArray, _, input::NDArray{T}, power::NDArray{T} ) where {T} - return nda_binary_op(out, cuNumeric.POWER, input, power) + return nda_binary_op!(out, cuNumeric.POWER, input, power) end # This is more "Julian" since a user expects map to broadcast diff --git a/src/ndarray/detail/ndarray.jl b/src/ndarray/detail/ndarray.jl index 5ecc10ea..7d0c8c2b 100644 --- a/src/ndarray/detail/ndarray.jl +++ b/src/ndarray/detail/ndarray.jl @@ -28,21 +28,19 @@ get_n_dim(ptr::NDArray_t) = Int(ccall((:nda_array_dim, libnda), Int32, (NDArray_ abstract type AbstractNDArray{T<:SUPPORTED_TYPES,N} end @doc""" -**Internal API** - The NDArray type represents a multi-dimensional array in cuNumeric. It is a wrapper around a Legate array and provides various methods for array manipulation and operations. Finalizer calls `nda_destroy_array` to clean up the underlying Legate array when the NDArray is garbage collected. 
""" -mutable struct NDArray{T,N} <: AbstractNDArray{T,N} +mutable struct NDArray{T, N, PADDED} <: AbstractNDArray{T,N} ptr::NDArray_t nbytes::Int64 - padding::Union{Nothing,NTuple{N,Int}} where {N} + padding::Union{Nothing,NTuple{N,Int}} - function NDArray(ptr::NDArray_t; T=get_julia_type(ptr), n_dim=get_n_dim(ptr)) + function NDArray(ptr::NDArray_t, ::Type{T}, ::Val{N}) where {T, N} nbytes = cuNumeric.nda_nbytes(ptr) cuNumeric.register_alloc!(nbytes) - handle = new{T,Int(n_dim)}(ptr, nbytes, nothing) + handle = new{T,N, false}(ptr, nbytes, nothing) finalizer(handle) do h cuNumeric.nda_destroy_array(h.ptr) cuNumeric.register_free!(h.nbytes) @@ -51,6 +49,9 @@ mutable struct NDArray{T,N} <: AbstractNDArray{T,N} end end +# Dynamic fallback, not great but required if we cannot infer things +NDArray(ptr::NDArray_t; T = get_julia_type(ptr), N::Integer = get_n_dim(ptr)) = NDArray(ptr, T, Val(N)) + # struct WrappedNDArray{T,N} <: AbstractNDArray{T,N} # ndarr::NDArray{T,N} # jlarr::Array{T,N} @@ -77,28 +78,28 @@ end # return NDArray(ptr, T = T, n_dim = 1) # end -NDArray(value::T) where {T<:SUPPORTED_TYPES} = nda_full_array(UInt64[], value) +NDArray(value::T) where {T<:SUPPORTED_TYPES} = nda_full_array((), value) # construction -function nda_zeros_array(shape::Vector{UInt64}, ::Type{T}) where {T} - n_dim = Int32(length(shape)) +function nda_zeros_array(dims::Dims{N}, ::Type{T}) where {T, N} + shape = collect(UInt64, dims) legate_type = Legate.to_legate_type(T) ptr = ccall((:nda_zeros_array, libnda), NDArray_t, (Int32, Ptr{UInt64}, Legate.LegateTypeAllocated), - n_dim, shape, legate_type) - return NDArray(ptr; T=T, n_dim=n_dim) + Int32(N), shape, legate_type) + return NDArray(ptr, T, Val(N)) end -function nda_full_array(shape::Vector{UInt64}, value::T) where {T} - n_dim = Int32(length(shape)) +function nda_full_array(dims::Dims{N}, value::T) where {T, N} + shape = collect(UInt64, dims) type = Legate.to_legate_type(T) ptr = ccall((:nda_full_array, libnda), NDArray_t, (Int32, 
Ptr{UInt64}, Legate.LegateTypeAllocated, Ptr{Cvoid}), - n_dim, shape, type, Ref(value)) + Int32(N), shape, type, Ref(value)) - return NDArray(ptr; T=T, n_dim=n_dim) + return NDArray(ptr, T, Val(N)) end function nda_random(arr::NDArray, gen_code) @@ -107,19 +108,19 @@ function nda_random(arr::NDArray, gen_code) arr.ptr, Int32(gen_code)) end -function nda_random_array(shape::Vector{UInt64}) - n_dim = Int32(length(shape)) +function nda_random_array(dims::Dims{N}) where {N} + shape = collect(UInt64, dims) ptr = ccall((:nda_random_array, libnda), NDArray_t, (Int32, Ptr{UInt64}), - n_dim, shape) - return NDArray(ptr; n_dim=n_dim) + Int32(N), shape) + return NDArray(ptr, Float64, Val(N)) #* T is always Float64 cause of cupynumeric end function nda_get_slice(arr::NDArray{T,N}, slices::Vector{Slice}) where {T,N} ptr = ccall((:nda_get_slice, libnda), NDArray_t, (NDArray_t, Ptr{Slice}, Cint), arr.ptr, pointer(slices), length(slices)) - return NDArray(ptr; T=T, n_dim=N) + return NDArray(ptr, T, Val(N)) end # queries @@ -142,12 +143,12 @@ function nda_array_shape(arr::NDArray) end # modify -function nda_reshape_array(arr::NDArray{T}, newshape::Vector{UInt64}) where {T} - n_dim = Int32(length(newshape)) +function nda_reshape_array(arr::NDArray{T}, newdims::Dims{N}) where {T, N} + newshape = collect(UInt64, newdims) ptr = ccall((:nda_reshape_array, libnda), NDArray_t, (NDArray_t, Int32, Ptr{UInt64}), - arr.ptr, n_dim, newshape) - return NDArray(ptr; T=T, n_dim=n_dim) + arr.ptr, Int32(N), newshape) + return NDArray(ptr, T, Val(N)) end function nda_astype(arr::NDArray{OLD_T,N}, ::Type{NEW_T}) where {OLD_T,NEW_T,N} @@ -156,7 +157,7 @@ function nda_astype(arr::NDArray{OLD_T,N}, ::Type{NEW_T}) where {OLD_T,NEW_T,N} NDArray_t, (NDArray_t, Legate.LegateTypeAllocated), arr.ptr, type) - return NDArray(ptr; T=NEW_T, n_dim=N) + return NDArray(ptr, NEW_T, Val(N)) end function nda_fill_array(arr::NDArray{T}, value::T) where {T} @@ -193,14 +194,14 @@ function nda_move(dst::NDArray{T,N}, 
src::NDArray{T,N}) where {T,N} end # operations -function nda_binary_op(out::NDArray, op_code::BinaryOpCode, rhs1::NDArray, rhs2::NDArray) +function nda_binary_op!(out::NDArray, op_code::BinaryOpCode, rhs1::NDArray, rhs2::NDArray) ccall((:nda_binary_op, libnda), Cvoid, (NDArray_t, BinaryOpCode, NDArray_t, NDArray_t), out.ptr, op_code, rhs1.ptr, rhs2.ptr) return out end -function nda_unary_op(out::NDArray, op_code::UnaryOpCode, input::NDArray) +function nda_unary_op!(out::NDArray, op_code::UnaryOpCode, input::NDArray) ccall((:nda_unary_op, libnda), Cvoid, (NDArray_t, UnaryOpCode, NDArray_t), out.ptr, op_code, input.ptr) @@ -218,7 +219,7 @@ function nda_array_equal(rhs1::NDArray{T,N}, rhs2::NDArray{T,N}) where {T,N} ptr = ccall((:nda_array_equal, libnda), NDArray_t, (NDArray_t, NDArray_t), rhs1.ptr, rhs2.ptr) - return NDArray(ptr; T=Bool, n_dim=1) + return NDArray(ptr, Bool, Val(1)) end function nda_diag(arr::NDArray, k::Int32) @@ -255,7 +256,7 @@ function nda_multiply_scalar(rhs1::NDArray{T,N}, value::T) where {T,N} ptr = ccall((:nda_multiply_scalar, libnda), NDArray_t, (NDArray_t, Legate.LegateTypeAllocated, Ptr{Cvoid}), rhs1.ptr, type, Ref(value)) - return NDArray(ptr; T=T, n_dim=N) + return NDArray(ptr, T, Val(N)) end function nda_add_scalar(rhs1::NDArray{T,N}, value::T) where {T,N} @@ -264,7 +265,7 @@ function nda_add_scalar(rhs1::NDArray{T,N}, value::T) where {T,N} ptr = ccall((:nda_add_scalar, libnda), NDArray_t, (NDArray_t, Legate.LegateTypeAllocated, Ptr{Cvoid}), rhs1.ptr, type, Ref(value)) - return NDArray(ptr; T=T, n_dim=N) + return NDArray(ptr, T, Val(N)) end function nda_three_dot_arg(rhs1::NDArray{T}, rhs2::NDArray{T}, out::NDArray{T}) where {T} @@ -286,7 +287,7 @@ function nda_eye(rows::Int32, ::Type{T}) where {T} ptr = ccall((:nda_eye, libnda), NDArray_t, (Int32, Legate.LegateTypeAllocated), rows, legate_type) - return NDArray(ptr; T=T, n_dim=2) + return NDArray(ptr, T, Val(2)) end function nda_trace( @@ -297,7 +298,7 @@ function nda_trace( NDArray_t, 
(NDArray_t, Int32, Int32, Int32, Legate.LegateTypeAllocated), arr.ptr, offset, a1, a2, legate_type) - return NDArray(ptr; T=T, n_dim=1) + return NDArray(ptr, T, Val(1)) end function nda_transpose(arr::NDArray) @@ -317,7 +318,7 @@ function nda_attach_external(arr::AbstractArray{T,N}) where {T,N} NDArray_t, (Ptr{Cvoid}, UInt64, Int32, Ptr{UInt64}, Legate.LegateTypeAllocated), ptr, nbytes, N, shape, legate_type) - return NDArray(nda_ptr; T=T, n_dim=N) + return NDArray(nda_ptr, T, Val(N)) end # return underlying logical store to the NDArray obj @@ -401,14 +402,6 @@ function slice_array(slices::Vararg{Tuple{Union{Int,Nothing},Union{Int,Nothing}} return v end -@doc""" - padded_shape(arr::NDArray) - -**Internal API** - -Return the size of the given `NDArray`. This will include the padded size. -""" -padded_shape(arr::NDArray) = Tuple(Int.(cuNumeric.nda_array_shape(arr))) @doc""" shape(arr::NDArray) @@ -417,11 +410,11 @@ padded_shape(arr::NDArray) = Tuple(Int.(cuNumeric.nda_array_shape(arr))) Return the size of the given `NDArray`. """ -function shape(arr::NDArray) - if !isnothing(arr.padding) - return arr.padding - end - return cuNumeric.padded_shape(arr) +shape(arr::NDArray{<:Any, N, true}) where N = arr.padding + +function shape(arr::NDArray{<:Any, N, false}) where {N} + shp = cuNumeric.nda_array_shape(arr) + return ntuple(i -> Int(shp[i]), Val(N)) end @doc""" diff --git a/src/ndarray/ndarray.jl b/src/ndarray/ndarray.jl index 02ec68e5..b703ceb9 100644 --- a/src/ndarray/ndarray.jl +++ b/src/ndarray/ndarray.jl @@ -220,7 +220,7 @@ size(arr) size(arr, 2) ``` """ -Base.size(arr::NDArray) = cuNumeric.shape(arr) +Base.size(arr::NDArray{<:Any, N}) where N = cuNumeric.shape(arr) Base.size(arr::NDArray, dim::Int) = Base.size(arr)[dim] @doc""" @@ -253,17 +253,21 @@ Base.IndexStyle(::NDArray) = IndexCartesian() function Base.show(io::IO, arr::NDArray{T,0}) where {T} println(io, "0-dimensional NDArray{$(T),0}") - print(io, arr[]) #! should I assert scalar?? 
+ allowscalar() do + print(io, arr[]) + end end function Base.show(io::IO, ::MIME"text/plain", arr::NDArray{T,0}) where {T} println(io, "0-dimensional NDArray{$(T),0}") - print(io, arr[]) #! should I assert scalar?? + allowscalar() do + print(io, arr[]) + end end function Base.show(io::IO, arr::NDArray{T,D}) where {T,D} println(io, "NDArray{$(T),$(D)}") - Base.print_matrix(io, Array(arr)) + Base.print_array(io, Array(arr)) end function Base.show(io::IO, ::MIME"text/plain", arr::NDArray{T}) where {T} @@ -304,7 +308,7 @@ Assignment also supports: # Examples ```@repl -A = cuNumeric.full((3, 3), 1.0); +A = cuNumeric.fill(1.0, (3, 3)); A[1, 2] A[1:2, 2:3] = cuNumeric.ones(2, 2); A[:, 1] = 5.0; @@ -461,25 +465,27 @@ Base.fill!(arr::NDArray{T}, val::T) where {T} = nda_fill_array(arr, val) #### INITIALIZATION OF NDARRAYS #### @doc""" - cuNumeric.full(dims::Tuple, val) - cuNumeric.full(dim::Int, val) + cuNumeric.fill(val::T, dims::Dims) + cuNumeric.fill(val::T, dims::Int...) Create an `NDArray` filled with the scalar value `val`, with the shape specified by `dims`. # Examples ```@repl -cuNumeric.full((2, 3), 7.5) -cuNumeric.full(4, 0) +cuNumeric.fill(7.5, (2, 3)) +cuNumeric.fill(0, 4) ``` """ -function full(dims::Dims, val::T) where {T<:SUPPORTED_TYPES} - shape = collect(UInt64, dims) - return nda_full_array(shape, val) +function fill(val::T, dims::Dims) where {T<:SUPPORTED_TYPES} + return nda_full_array(dims, val) end -function full(dim::Int, val::T) where {T<:SUPPORTED_TYPES} - shape = UInt64[dim] - return nda_full_array(shape, val) +function fill(val::T, dims::Int...) where {T<:SUPPORTED_TYPES} + return fill(val, dims) +end + +function fill(val::T, dim::Int) where {T<:SUPPORTED_TYPES} + return fill(val, (dim,)) end @doc""" @@ -494,9 +500,9 @@ Create an `NDArray` filled with the true, with the shape specified by `dims`. cuNumeric.trues(2, 3) ``` """ -trues(dim::Int) = cuNumeric.full(dim, true) -trues(dims::Dims) = cuNumeric.full(dims, true) -trues(dims::Int...) 
= cuNumeric.full(dims, true) +trues(dim::Int) = cuNumeric.fill(true, dim) +trues(dims::Dims) = cuNumeric.fill(true, dims) +trues(dims::Int...) = cuNumeric.fill(true, dims) @doc""" cuNumeric.falses(dims::Tuple, val) @@ -510,9 +516,10 @@ Create an `NDArray` filled with the false, with the shape specified by `dims`. cuNumeric.falses(2, 3) ``` """ -falses(dim::Int) = cuNumeric.full(dim, false) -falses(dims::Dims) = cuNumeric.full(dims, false) -falses(dims::Int...) = cuNumeric.full(dims, false) +falses(dims::Dims) = cuNumeric.fill(false, dims) +falses(dims::Int...) = cuNumeric.fill(false, dims) +falses(dim::Int) = cuNumeric.fill(false, dim) + @doc""" cuNumeric.zeros([T=Float32,] dims::Int...) @@ -528,9 +535,8 @@ cuNumeric.zeros(Float64, 3) cuNumeric.zeros(Int32, (2,3)) ``` """ -function zeros(::Type{T}, dims::Dims) where {T<:SUPPORTED_TYPES} - shape = collect(UInt64, dims) - return nda_zeros_array(shape, T) +function zeros(::Type{T}, dims::Dims{N}) where {T<:SUPPORTED_TYPES, N} + return nda_zeros_array(dims, T) end function zeros(::Type{T}, dims::Int...) where {T<:SUPPORTED_TYPES} @@ -546,15 +552,15 @@ function zeros(dims::Int...) end function zeros(::Type{T}) where {T} - return nda_zeros_array(UInt64[], T) + return nda_zeros_array((), T) end function zeros() return zeros(DEFAULT_FLOAT) end -function zeros_like(arr::NDArray) - return zeros(eltype(arr), Base.size(arr)) +function zeros_like(arr::NDArray{T,N}) where {T,N} + return zeros(T, Base.size(arr)) end @doc""" @@ -572,7 +578,7 @@ cuNumeric.ones(Int32, (2, 3)) ``` """ function ones(::Type{T}, dims::Dims) where {T} - return full(dims, T(1)) + return nda_full_array(dims, T(1)) end function ones(::Type{T}, dims::Int...) where {T} @@ -588,11 +594,11 @@ function ones(dims::Int...) 
end function ones(::Type{T}) where {T} - return full((), T(1)) + return cuNumeric.fill(T(1), ()) end function ones() - return zeros(DEFAULT_FLOAT) + return ones(DEFAULT_FLOAT) end @doc""" @@ -617,14 +623,13 @@ A = cuNumeric.zeros(2, 2); cuNumeric.rand!(A) ``` """ Random.rand!(arr::NDArray{Float64}) = cuNumeric.nda_random(arr, 0) -rand(::Type{NDArray}, dims::Dims) = cuNumeric.nda_random_array(UInt64.(collect(dims))) +rand(::Type{NDArray}, dims::Dims) = cuNumeric.nda_random_array(dims) rand(::Type{NDArray}, dims::Int...) = cuNumeric.rand(NDArray, dims) rand(dims::Dims) = cuNumeric.rand(NDArray, dims) rand(dims::Int...) = cuNumeric.rand(NDArray, dims) function rand(::Type{T}, dims::Dims) where {T<:AbstractFloat} - arrfp64 = cuNumeric.nda_random_array(UInt64.(collect(dims))) - # if T == Float64, as_type should do minimial work # TODO check this. + arrfp64 = cuNumeric.nda_random_array(dims) return cuNumeric.as_type(arrfp64, T) end @@ -645,14 +650,15 @@ reshape(arr, 12) ``` """ +#* UNSTABLE: use Val{false} if we really want this flag function reshape(arr::NDArray, i::Dims{N}; copy::Bool=false) where {N} - reshaped = nda_reshape_array(arr, UInt64.(collect(i))) + reshaped = nda_reshape_array(arr, i) return copy ? copy(reshaped) : reshaped end -function reshape(arr::NDArray, i::Int64; copy::Bool=false) - reshaped = nda_reshape_array(arr, UInt64.([i])) - return copy ? copy(reshaped) : reshaped +#* UNSTABLE: use Val{false} if we really want this flag +function reshape(arr::NDArray, i::Int...; copy::Bool=false) + return reshape(arr, i; copy = copy) end # Ignore the scalar indexing here... 
diff --git a/src/ndarray/unary.jl b/src/ndarray/unary.jl index 4158b010..3b12e73d 100644 --- a/src/ndarray/unary.jl +++ b/src/ndarray/unary.jl @@ -101,13 +101,13 @@ global const unary_op_map_no_args = Dict{Function,UnaryOpCode}( ### SPECIAL CASES ### # Needed to support != -Base.:(!)(input::NDArray{Bool,0}) = nda_unary_op(similar(input), cuNumeric.LOGICAL_NOT, input) -Base.:(!)(input::NDArray{Bool,1}) = nda_unary_op(similar(input), cuNumeric.LOGICAL_NOT, input) +Base.:(!)(input::NDArray{Bool,0}) = nda_unary_op!(similar(input), cuNumeric.LOGICAL_NOT, input) +Base.:(!)(input::NDArray{Bool,1}) = nda_unary_op!(similar(input), cuNumeric.LOGICAL_NOT, input) # Non-broadcasted version of negation function Base.:(-)(input::NDArray{T}) where {T} out = cuNumeric.zeros(T, size(input)) - return nda_unary_op(out, cuNumeric.NEGATIVE, input) + return nda_unary_op!(out, cuNumeric.NEGATIVE, input) end function Base.:(-)(input::NDArray{Bool}) @@ -121,7 +121,7 @@ end @inline function __broadcast( f::typeof(Base.literal_pow), out::NDArray{O}, _, input::NDArray{T}, ::Type{Val{2}} ) where {T,O} - return nda_unary_op(out, cuNumeric.SQUARE, input) + return nda_unary_op!(out, cuNumeric.SQUARE, input) end @inline function __broadcast( @@ -129,13 +129,13 @@ end ) where {O} nda_move(out, O(1) ./ checked_promote_arr(input, O)) #! REPLACE WITH RECIP ONCE FIXED return out - # return nda_unary_op(out, cuNumeric.RECIPROCAL, input) + # return nda_unary_op!(out, cuNumeric.RECIPROCAL, input) end @inline function __broadcast(::typeof(Base.inv), out::NDArray{O}, input::NDArray) where {O} nda_move(out, O(1) ./ checked_promote_arr(input, O)) #! REPLACE WITH RECIP ONCE FIXED return out - # return nda_unary_op(out, cuNumeric.RECIPROCAL, checked_promote_arr(input,O)) + # return nda_unary_op!(out, cuNumeric.RECIPROCAL, checked_promote_arr(input,O)) end #! 
NEEDS TO SUPPORT inv and ^ -1 @@ -150,7 +150,7 @@ end # Only supported for Bools @inline function __broadcast(f::typeof(Base.:(!)), out::NDArray{Bool}, input::NDArray{Bool}) - return nda_unary_op(out, cuNumeric.LOGICAL_NOT, input) + return nda_unary_op!(out, cuNumeric.LOGICAL_NOT, input) end # Generate hidden broadcasted version of unary ops. @@ -159,7 +159,7 @@ for (julia_fn, op_code) in unary_op_map_no_args @inline function __broadcast( f::typeof($julia_fn), out::NDArray{T}, input::NDArray{T} ) where {T} - return nda_unary_op(out, $(op_code), input) + return nda_unary_op!(out, $(op_code), input) end end end @@ -172,7 +172,7 @@ for (julia_fn, op_code) in floaty_unary_ops_no_args @inline function __broadcast( f::typeof($julia_fn), out::NDArray{T}, input::NDArray{T} ) where {T} - return nda_unary_op(out, $(op_code), input) + return nda_unary_op!(out, $(op_code), input) end # If input is not already float, promote to that diff --git a/test/runtests.jl b/test/runtests.jl index 60137c4c..4df77506 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -380,6 +380,10 @@ end end end +@testset verbose = true "Type Stability" begin + include("tests/stability.jl") +end + @testset verbose = true "Scoping" begin N = 100 diff --git a/test/tests/stability.jl b/test/tests/stability.jl new file mode 100644 index 00000000..d097454a --- /dev/null +++ b/test/tests/stability.jl @@ -0,0 +1,77 @@ +@testset verbose = true "core" begin + a = cuNumeric.zeros(5) + b = cuNumeric.zeros(Float64, 3, 4) + @inferred size(a) + @inferred size(b) + @inferred cuNumeric.shape(a) + @inferred cuNumeric.shape(b) +end + +@testset verbose = true "construction" begin + # zeros, zeros_like, ones, rand, fill, trues, falses\ + for constructor in (:zeros, :ones) + @eval begin + @inferred cuNumeric.$(constructor)(Float64, 3, 2) + @inferred cuNumeric.$(constructor)(Float64, (3, 4)) + @inferred cuNumeric.$(constructor)(3, 5, 6) + @inferred cuNumeric.$(constructor)((3,)) + @inferred cuNumeric.$(constructor)() + 
@inferred cuNumeric.$(constructor)(Int64) + end + end + a = cuNumeric.zeros(Float64, 5, 3) + @inferred cuNumeric.zeros_like(a) + + for constructor in (:trues, :falses) + @eval begin + @inferred cuNumeric.$(constructor)(5) + @inferred cuNumeric.$(constructor)((5,4)) + @inferred cuNumeric.$(constructor)(3, 4, 5) + end + end + + @inferred cuNumeric.fill(2.0, 3, 4) + @inferred cuNumeric.fill(2, (3, 4)) + @inferred cuNumeric.fill(2.0, 3) + + @inferred cuNumeric.rand(4, 3) + @inferred cuNumeric.rand(Float32, 5) +end + +@testset verbose = true "conversion" begin + # cast to array, as_type + a = cuNumeric.zeros(Float64, 5, 5) + @inferred Array(a) + @inferred Array{Float32}(a) + @inferred cuNumeric.as_type(a, Float32) + @inferred cuNumeric.as_type(a, Int64) +end + +@testset verbose = true "indexing" begin + # getindex, setindex!, copy, copyto!, fill!, as_type + a = cuNumeric.zeros(Float32, 5, 5) + b = cuNumeric.zeros(Int32, 11) + + @inferred a[1:3, 1:3] + @inferred a[2, 1:3] + @inferred a[1, 1:3] .+ b[1:3] + @inferred b[1:5] + # @inferred a[1:3, 1:end] + allowscalar() do + @inferred a[1, 2] + end +end + +@testset verbose = true "broadcasting" begin + a = cuNumeric.ones(Float32, 3, 3) + b = cuNumeric.ones(Int32, 3, 3) + @inferred 5 .* a + @inferred 5.0f0 .* a + @inferred 5 * a + @inferred 5.0f0 * a + + @inferred a .* b + @inferred a .+ b + @inferred a ./ b + @inferred ((a .* b) .+ a) .* 2.0f0 +end \ No newline at end of file