Skip to content

Conversation

@bharatr21
Copy link

@bharatr21 bharatr21 commented Dec 1, 2025

Description

closes #1282

Replace isinstance(obj, T) checks with type(obj) is T to optimize cuda.core.launch()


Additional Notes


The original issue requested profiling, so I wrote a benchmarking script in Cython to measure the effect of using type() in place of isinstance() checks; in my runs the type() version was ~5x faster.
I would appreciate some guidance on whether I've done the profiling correctly.

Created a file benchmark_isinstance_cython.pyx :

from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.stdint cimport (intptr_t,
                         int8_t, int16_t, int32_t, int64_t,
                         uint8_t, uint16_t, uint32_t, uint64_t)
from libcpp cimport bool as cpp_bool
from libcpp.complex cimport complex as cpp_complex
from libcpp.vector cimport vector

import ctypes
import numpy
import time
from statistics import mean, stdev


# C++ complex specializations used as storage types for NumPy complex scalars.
ctypedef cpp_complex.complex[float] cpp_single_complex
ctypedef cpp_complex.complex[double] cpp_double_complex

# Bind each ctypes / NumPy scalar type to a module-level object variable once,
# so the type checks below compare against these cached references.
cdef:
   # ctypes scalar types
   object ctypes_bool = ctypes.c_bool
   object ctypes_int8 = ctypes.c_int8
   object ctypes_int16 = ctypes.c_int16
   object ctypes_int32 = ctypes.c_int32
   object ctypes_int64 = ctypes.c_int64
   object ctypes_uint8 = ctypes.c_uint8
   object ctypes_uint16 = ctypes.c_uint16
   object ctypes_uint32 = ctypes.c_uint32
   object ctypes_uint64 = ctypes.c_uint64
   object ctypes_float = ctypes.c_float
   object ctypes_double = ctypes.c_double

   # NumPy scalar types
   object numpy_bool = numpy.bool_
   object numpy_int8 = numpy.int8
   object numpy_int16 = numpy.int16
   object numpy_int32 = numpy.int32
   object numpy_int64 = numpy.int64
   object numpy_uint8 = numpy.uint8
   object numpy_uint16 = numpy.uint16
   object numpy_uint32 = numpy.uint32
   object numpy_uint64 = numpy.uint64
   object numpy_float16 = numpy.float16
   object numpy_float32 = numpy.float32
   object numpy_float64 = numpy.float64
   object numpy_complex64 = numpy.complex64
   object numpy_complex128 = numpy.complex128

# Named alias for void*, used as the element type in vector[voidptr].
# Limitation due to cython/cython#534
ctypedef void* voidptr


# ============================================================================
# Version 1: Current implementation using isinstance()
# ============================================================================

# Copy the value of a ctypes scalar into newly allocated C storage, choosing the
# C type with an isinstance() chain (the baseline variant of the benchmark).
#
# Parameters:
#   data, data_addresses: output vectors; on a match, slot `idx` of both is set
#       to the same newly allocated pointer.
#   arg: Python object to convert; its `.value` attribute is read on a match.
#   idx: slot written in both vectors.
#
# Returns 0 when `arg` matched one of the cached ctypes types and was stored,
# 1 when no branch matched (nothing is allocated or written). The `except -1`
# clause reserves -1 for signalling a raised Python exception.
#
# isinstance() also accepts instances of subclasses of the cached types.
#
# NOTE(review): the pointer returned by PyMem_Malloc is written through without
# a NULL check. The buffer is not freed here; the benchmark functions free the
# pointers stored in `data`.
cdef inline int prepare_ctypes_arg_isinstance(
       vector[void*]& data,
       vector[void*]& data_addresses,
       arg,
       const size_t idx) except -1:
   cdef void* ptr

   # Every branch follows the same pattern: allocate sizeof(T) bytes, store
   # arg.value converted to T, then record the pointer in both vectors.
   if isinstance(arg, ctypes_bool):
       ptr = PyMem_Malloc(sizeof(cpp_bool))
       (<cpp_bool*>ptr)[0] = <cpp_bool>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, ctypes_int8):
       ptr = PyMem_Malloc(sizeof(int8_t))
       (<int8_t*>ptr)[0] = <int8_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, ctypes_int16):
       ptr = PyMem_Malloc(sizeof(int16_t))
       (<int16_t*>ptr)[0] = <int16_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, ctypes_int32):
       ptr = PyMem_Malloc(sizeof(int32_t))
       (<int32_t*>ptr)[0] = <int32_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, ctypes_int64):
       ptr = PyMem_Malloc(sizeof(int64_t))
       (<int64_t*>ptr)[0] = <int64_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, ctypes_uint8):
       ptr = PyMem_Malloc(sizeof(uint8_t))
       (<uint8_t*>ptr)[0] = <uint8_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, ctypes_uint16):
       ptr = PyMem_Malloc(sizeof(uint16_t))
       (<uint16_t*>ptr)[0] = <uint16_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, ctypes_uint32):
       ptr = PyMem_Malloc(sizeof(uint32_t))
       (<uint32_t*>ptr)[0] = <uint32_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, ctypes_uint64):
       ptr = PyMem_Malloc(sizeof(uint64_t))
       (<uint64_t*>ptr)[0] = <uint64_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, ctypes_float):
       ptr = PyMem_Malloc(sizeof(float))
       (<float*>ptr)[0] = <float>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, ctypes_double):
       ptr = PyMem_Malloc(sizeof(double))
       (<double*>ptr)[0] = <double>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   else:
       # Not a recognised ctypes scalar: tell the caller nothing was prepared.
       return 1


# Copy a NumPy scalar into newly allocated C storage, choosing the C type with
# an isinstance() chain (the baseline variant of the benchmark).
#
# Parameters:
#   data, data_addresses: output vectors; on a match, slot `idx` of both is set
#       to the same newly allocated pointer.
#   arg: Python object to convert; it is cast directly to the C type, except
#       for the complex types, which are built from `arg.real` and `arg.imag`.
#   idx: slot written in both vectors.
#
# Returns 0 when `arg` matched one of the checked NumPy types and was stored,
# 1 when no branch matched (nothing is allocated or written). The `except -1`
# clause reserves -1 for signalling a raised Python exception.
#
# isinstance() also accepts instances of subclasses of the cached types.
#
# NOTE(review): the pointer returned by PyMem_Malloc is written through without
# a NULL check. The buffer is not freed here; the benchmark functions free the
# pointers stored in `data`.
# NOTE(review): numpy_float16 is cached at module level but has no branch here,
# so a float16 scalar ends in the final `return 1` - confirm this is intended.
cdef inline int prepare_numpy_arg_isinstance(
       vector[void*]& data,
       vector[void*]& data_addresses,
       arg,
       const size_t idx) except -1:
   cdef void* ptr

   # Every branch follows the same pattern: allocate sizeof(T) bytes, store the
   # converted value, then record the pointer in both vectors.
   if isinstance(arg, numpy_bool):
       ptr = PyMem_Malloc(sizeof(cpp_bool))
       (<cpp_bool*>ptr)[0] = <cpp_bool>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_int8):
       ptr = PyMem_Malloc(sizeof(int8_t))
       (<int8_t*>ptr)[0] = <int8_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_int16):
       ptr = PyMem_Malloc(sizeof(int16_t))
       (<int16_t*>ptr)[0] = <int16_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_int32):
       ptr = PyMem_Malloc(sizeof(int32_t))
       (<int32_t*>ptr)[0] = <int32_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_int64):
       ptr = PyMem_Malloc(sizeof(int64_t))
       (<int64_t*>ptr)[0] = <int64_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_uint8):
       ptr = PyMem_Malloc(sizeof(uint8_t))
       (<uint8_t*>ptr)[0] = <uint8_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_uint16):
       ptr = PyMem_Malloc(sizeof(uint16_t))
       (<uint16_t*>ptr)[0] = <uint16_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_uint32):
       ptr = PyMem_Malloc(sizeof(uint32_t))
       (<uint32_t*>ptr)[0] = <uint32_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_uint64):
       ptr = PyMem_Malloc(sizeof(uint64_t))
       (<uint64_t*>ptr)[0] = <uint64_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_float32):
       ptr = PyMem_Malloc(sizeof(float))
       (<float*>ptr)[0] = <float>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_float64):
       ptr = PyMem_Malloc(sizeof(double))
       (<double*>ptr)[0] = <double>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_complex64):
       # Complex values are rebuilt from their real and imaginary parts.
       ptr = PyMem_Malloc(sizeof(cpp_single_complex))
       (<cpp_single_complex*>ptr)[0] = cpp_complex.complex[float](arg.real, arg.imag)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif isinstance(arg, numpy_complex128):
       ptr = PyMem_Malloc(sizeof(cpp_double_complex))
       (<cpp_double_complex*>ptr)[0] = cpp_complex.complex[double](arg.real, arg.imag)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   else:
       # Not a recognised NumPy scalar: tell the caller nothing was prepared.
       return 1


# ============================================================================
# Version 2: Optimized implementation using type() is
# ============================================================================

# Copy the value of a ctypes scalar into newly allocated C storage, choosing the
# C type by comparing type(arg) for identity against the cached ctypes types
# (the optimized variant of the benchmark).
#
# Parameters:
#   data, data_addresses: output vectors; on a match, slot `idx` of both is set
#       to the same newly allocated pointer.
#   arg: Python object to convert; its `.value` attribute is read on a match.
#   idx: slot written in both vectors.
#
# Returns 0 when type(arg) is exactly one of the cached ctypes types and the
# value was stored, 1 when no branch matched (nothing is allocated or written).
# The `except -1` clause reserves -1 for signalling a raised Python exception.
#
# Unlike an isinstance() check, the identity comparison does not match
# instances of subclasses of the cached types.
#
# NOTE(review): the pointer returned by PyMem_Malloc is written through without
# a NULL check. The buffer is not freed here; the benchmark functions free the
# pointers stored in `data`.
cdef inline int prepare_ctypes_arg_type_is(
       vector[void*]& data,
       vector[void*]& data_addresses,
       arg,
       const size_t idx) except -1:
   cdef void* ptr
   # Fetch the type once; every branch below reuses it.
   cdef object arg_type = type(arg)

   # Every branch follows the same pattern: allocate sizeof(T) bytes, store
   # arg.value converted to T, then record the pointer in both vectors.
   if arg_type is ctypes_bool:
       ptr = PyMem_Malloc(sizeof(cpp_bool))
       (<cpp_bool*>ptr)[0] = <cpp_bool>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is ctypes_int8:
       ptr = PyMem_Malloc(sizeof(int8_t))
       (<int8_t*>ptr)[0] = <int8_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is ctypes_int16:
       ptr = PyMem_Malloc(sizeof(int16_t))
       (<int16_t*>ptr)[0] = <int16_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is ctypes_int32:
       ptr = PyMem_Malloc(sizeof(int32_t))
       (<int32_t*>ptr)[0] = <int32_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is ctypes_int64:
       ptr = PyMem_Malloc(sizeof(int64_t))
       (<int64_t*>ptr)[0] = <int64_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is ctypes_uint8:
       ptr = PyMem_Malloc(sizeof(uint8_t))
       (<uint8_t*>ptr)[0] = <uint8_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is ctypes_uint16:
       ptr = PyMem_Malloc(sizeof(uint16_t))
       (<uint16_t*>ptr)[0] = <uint16_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is ctypes_uint32:
       ptr = PyMem_Malloc(sizeof(uint32_t))
       (<uint32_t*>ptr)[0] = <uint32_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is ctypes_uint64:
       ptr = PyMem_Malloc(sizeof(uint64_t))
       (<uint64_t*>ptr)[0] = <uint64_t>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is ctypes_float:
       ptr = PyMem_Malloc(sizeof(float))
       (<float*>ptr)[0] = <float>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is ctypes_double:
       ptr = PyMem_Malloc(sizeof(double))
       (<double*>ptr)[0] = <double>(arg.value)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   else:
       # Not a recognised ctypes scalar: tell the caller nothing was prepared.
       return 1


# Copy a NumPy scalar into newly allocated C storage, choosing the C type by
# comparing type(arg) for identity against the cached NumPy types (the
# optimized variant of the benchmark).
#
# Parameters:
#   data, data_addresses: output vectors; on a match, slot `idx` of both is set
#       to the same newly allocated pointer.
#   arg: Python object to convert; it is cast directly to the C type, except
#       for the complex types, which are built from `arg.real` and `arg.imag`.
#   idx: slot written in both vectors.
#
# Returns 0 when type(arg) is exactly one of the checked NumPy types and the
# value was stored, 1 when no branch matched (nothing is allocated or written).
# The `except -1` clause reserves -1 for signalling a raised Python exception.
#
# Unlike an isinstance() check, the identity comparison does not match
# instances of subclasses of the cached types.
#
# NOTE(review): the pointer returned by PyMem_Malloc is written through without
# a NULL check. The buffer is not freed here; the benchmark functions free the
# pointers stored in `data`.
# NOTE(review): numpy_float16 is cached at module level but has no branch here,
# so a float16 scalar ends in the final `return 1` - confirm this is intended.
cdef inline int prepare_numpy_arg_type_is(
       vector[void*]& data,
       vector[void*]& data_addresses,
       arg,
       const size_t idx) except -1:
   cdef void* ptr
   # Fetch the type once; every branch below reuses it.
   cdef object arg_type = type(arg)

   # Every branch follows the same pattern: allocate sizeof(T) bytes, store the
   # converted value, then record the pointer in both vectors.
   if arg_type is numpy_bool:
       ptr = PyMem_Malloc(sizeof(cpp_bool))
       (<cpp_bool*>ptr)[0] = <cpp_bool>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_int8:
       ptr = PyMem_Malloc(sizeof(int8_t))
       (<int8_t*>ptr)[0] = <int8_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_int16:
       ptr = PyMem_Malloc(sizeof(int16_t))
       (<int16_t*>ptr)[0] = <int16_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_int32:
       ptr = PyMem_Malloc(sizeof(int32_t))
       (<int32_t*>ptr)[0] = <int32_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_int64:
       ptr = PyMem_Malloc(sizeof(int64_t))
       (<int64_t*>ptr)[0] = <int64_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_uint8:
       ptr = PyMem_Malloc(sizeof(uint8_t))
       (<uint8_t*>ptr)[0] = <uint8_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_uint16:
       ptr = PyMem_Malloc(sizeof(uint16_t))
       (<uint16_t*>ptr)[0] = <uint16_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_uint32:
       ptr = PyMem_Malloc(sizeof(uint32_t))
       (<uint32_t*>ptr)[0] = <uint32_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_uint64:
       ptr = PyMem_Malloc(sizeof(uint64_t))
       (<uint64_t*>ptr)[0] = <uint64_t>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_float32:
       ptr = PyMem_Malloc(sizeof(float))
       (<float*>ptr)[0] = <float>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_float64:
       ptr = PyMem_Malloc(sizeof(double))
       (<double*>ptr)[0] = <double>(arg)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_complex64:
       # Complex values are rebuilt from their real and imaginary parts.
       ptr = PyMem_Malloc(sizeof(cpp_single_complex))
       (<cpp_single_complex*>ptr)[0] = cpp_complex.complex[float](arg.real, arg.imag)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   elif arg_type is numpy_complex128:
       ptr = PyMem_Malloc(sizeof(cpp_double_complex))
       (<cpp_double_complex*>ptr)[0] = cpp_complex.complex[double](arg.real, arg.imag)
       data_addresses[idx] = ptr
       data[idx] = ptr
       return 0
   else:
       # Not a recognised NumPy scalar: tell the caller nothing was prepared.
       return 1


# ============================================================================
# Benchmark functions
# ============================================================================

def benchmark_isinstance(kernel_args, int iterations):
   """Time argument preparation using the isinstance()-based helpers.

   Runs 100 untimed warmup passes over ``kernel_args`` and then ``iterations``
   timed passes. In every pass, arguments that are instances of ``int``,
   ``float`` or ``complex`` are skipped; each remaining argument is passed to
   ``prepare_numpy_arg_isinstance`` and, if that reports no match, to
   ``prepare_ctypes_arg_isinstance``. The buffers allocated during a pass are
   freed at the end of that pass. In the timed passes, vector construction
   and freeing are included in the measured time.

   Args:
       kernel_args: Sized iterable of argument objects (``len()`` and
           ``enumerate()`` are applied to it).
       iterations: Number of timed passes.

   Returns:
       Elapsed time in seconds for all timed passes, measured as the
       difference of two ``time.perf_counter()`` readings.
   """
   cdef size_t n_args = len(kernel_args)
   cdef size_t i, j
   cdef int not_prepared
   cdef vector[voidptr] data
   cdef vector[voidptr] data_addresses
   cdef double start, end

   # Warmup
   for _ in range(100):
       data = vector[voidptr](n_args, NULL)
       data_addresses = vector[voidptr](n_args)
       for i, arg in enumerate(kernel_args):
           # NOTE(review): isinstance() matches subclasses. numpy.float64 and
           # numpy.complex128 derive from Python float and complex, so such
           # arguments are skipped here, whereas an exact type() comparison
           # does not skip them - confirm both benchmarks do the same work.
           if isinstance(arg, int):
               continue
           elif isinstance(arg, float):
               continue
           elif isinstance(arg, complex):
               continue
           # bool is a subclass of int, so this branch is never reached.
           elif isinstance(arg, bool):
               continue

           not_prepared = prepare_numpy_arg_isinstance(data, data_addresses, arg, i)
           if not_prepared:
               # Result is not inspected: unmatched arguments are ignored.
               not_prepared = prepare_ctypes_arg_isinstance(data, data_addresses, arg, i)

       # Release this pass's buffers; slots left NULL were never filled.
       for data_ptr in data:
           if data_ptr:
               PyMem_Free(data_ptr)

   # Actual benchmark
   start = time.perf_counter()
   for j in range(iterations):
       data = vector[voidptr](n_args, NULL)
       data_addresses = vector[voidptr](n_args)

       for i, arg in enumerate(kernel_args):
           # Same skip chain as in the warmup loop (see the notes there).
           if isinstance(arg, int):
               continue
           elif isinstance(arg, float):
               continue
           elif isinstance(arg, complex):
               continue
           elif isinstance(arg, bool):
               continue

           not_prepared = prepare_numpy_arg_isinstance(data, data_addresses, arg, i)
           if not_prepared:
               not_prepared = prepare_ctypes_arg_isinstance(data, data_addresses, arg, i)

       # Freeing happens inside the timed region.
       for data_ptr in data:
           if data_ptr:
               PyMem_Free(data_ptr)

   end = time.perf_counter()
   return end - start


def benchmark_type_is(kernel_args, int iterations):
   """Time argument preparation using the ``type(x) is T`` helpers.

   Runs 100 untimed warmup passes over ``kernel_args`` and then ``iterations``
   timed passes. In every pass, arguments whose exact type is ``int``,
   ``float``, ``complex`` or ``bool`` are skipped; each remaining argument is
   passed to ``prepare_numpy_arg_type_is`` and, if that reports no match, to
   ``prepare_ctypes_arg_type_is``. The buffers allocated during a pass are
   freed at the end of that pass. In the timed passes, vector construction
   and freeing are included in the measured time.

   Args:
       kernel_args: Sized iterable of argument objects (``len()`` and
           ``enumerate()`` are applied to it).
       iterations: Number of timed passes.

   Returns:
       Elapsed time in seconds for all timed passes, measured as the
       difference of two ``time.perf_counter()`` readings.
   """
   cdef size_t n_args = len(kernel_args)
   cdef size_t i, j
   cdef int not_prepared
   cdef vector[voidptr] data
   cdef vector[voidptr] data_addresses
   cdef double start, end
   cdef object arg_type

   # Warmup
   for _ in range(100):
       data = vector[voidptr](n_args, NULL)
       data_addresses = vector[voidptr](n_args)
       for i, arg in enumerate(kernel_args):
           # Exact-type comparison: subclasses of int/float/complex are not
           # skipped here.
           # NOTE(review): numpy.float64 and numpy.complex128 subclass Python
           # float and complex, so an isinstance()-based skip chain treats
           # them differently - confirm both benchmarks do the same work.
           arg_type = type(arg)
           if arg_type is int:
               continue
           elif arg_type is float:
               continue
           elif arg_type is complex:
               continue
           elif arg_type is bool:
               continue

           not_prepared = prepare_numpy_arg_type_is(data, data_addresses, arg, i)
           if not_prepared:
               # Result is not inspected: unmatched arguments are ignored.
               not_prepared = prepare_ctypes_arg_type_is(data, data_addresses, arg, i)

       # Release this pass's buffers; slots left NULL were never filled.
       for data_ptr in data:
           if data_ptr:
               PyMem_Free(data_ptr)

   # Actual benchmark
   start = time.perf_counter()
   for j in range(iterations):
       data = vector[voidptr](n_args, NULL)
       data_addresses = vector[voidptr](n_args)

       for i, arg in enumerate(kernel_args):
           # Same skip chain as in the warmup loop (see the notes there).
           arg_type = type(arg)
           if arg_type is int:
               continue
           elif arg_type is float:
               continue
           elif arg_type is complex:
               continue
           elif arg_type is bool:
               continue

           not_prepared = prepare_numpy_arg_type_is(data, data_addresses, arg, i)
           if not_prepared:
               not_prepared = prepare_ctypes_arg_type_is(data, data_addresses, arg, i)

       # Freeing happens inside the timed region.
       for data_ptr in data:
           if data_ptr:
               PyMem_Free(data_ptr)

   end = time.perf_counter()
   return end - start


def run_benchmark():
   """Run both benchmarks, print the timings, and report the speedup.

   Builds a fixed list of NumPy and ctypes scalar arguments, then calls
   ``benchmark_isinstance`` and ``benchmark_type_is`` ``num_runs`` times each
   with ``iterations`` simulated launches per run. Prints every run's time,
   the mean and sample standard deviation per variant, the speedup ratio with
   a recommendation, and the per-launch cost.

   Each value returned by the benchmark functions covers ``iterations``
   launches, so the means are seconds per run. Per-launch figures are
   therefore derived by dividing a mean by ``iterations`` (not by
   ``iterations * num_runs``, which would understate them by ``num_runs``).

   Returns:
       None. All results are written to stdout.
   """
   print("=" * 70)
   print("Cython Benchmark: isinstance() vs type() is")
   print("Kernel Argument Handling Hot Path")
   print("=" * 70)
   print()

   # Create realistic kernel arguments
   kernel_args = [
       numpy.int32(100),
       numpy.float32(2.5),
       numpy.float64(1.23),
       numpy.complex64(1+1j),
       numpy.int64(999),
       numpy.uint32(255),
       ctypes.c_int32(50),
       ctypes.c_float(1.5),
       ctypes.c_double(2.7),
       numpy.int8(10),
       numpy.int16(20),
       numpy.uint8(5),
       numpy.uint16(30),
       numpy.float32(0.5),
   ]

   iterations = 50000
   num_runs = 10

   print("Configuration:")
   print(f"  - Arguments per launch: {len(kernel_args)}")
   print(f"  - Simulated launches per run: {iterations:,}")
   print(f"  - Number of runs: {num_runs}")
   print(f"  - Total argument processing: {len(kernel_args) * iterations * num_runs:,}")
   print()

   print("Running isinstance() benchmark...")
   isinstance_times = []
   for i in range(num_runs):
       t = benchmark_isinstance(kernel_args, iterations)
       isinstance_times.append(t)
       print(f"  Run {i+1}: {t:.4f}s")
   isinstance_mean = mean(isinstance_times)
   isinstance_stdev = stdev(isinstance_times)
   print(f"  Mean: {isinstance_mean:.4f}s ± {isinstance_stdev:.6f}s")
   print()

   print("Running type() is benchmark...")
   type_is_times = []
   for i in range(num_runs):
       t = benchmark_type_is(kernel_args, iterations)
       type_is_times.append(t)
       print(f"  Run {i+1}: {t:.4f}s")
   type_is_mean = mean(type_is_times)
   type_is_stdev = stdev(type_is_times)
   print(f"  Mean: {type_is_mean:.4f}s ± {type_is_stdev:.6f}s")
   print()

   print("=" * 70)
   print("RESULTS")
   print("=" * 70)
   print(f"isinstance():  {isinstance_mean:.4f}s ± {isinstance_stdev:.6f}s")
   print(f"type() is:     {type_is_mean:.4f}s ± {type_is_stdev:.6f}s")
   print()

   speedup = isinstance_mean / type_is_mean
   # Difference of the per-run means, i.e. seconds saved per `iterations` launches.
   time_saved = isinstance_mean - type_is_mean
   percent_faster = (speedup - 1) * 100

   if speedup > 1.02:
       print(f"✓ type() is is {speedup:.2f}x FASTER ({percent_faster:.1f}% improvement)")
       # seconds per launch -> seconds per 1M launches -> milliseconds
       print(f"  Time saved per 1M launches: {time_saved / iterations * 1e6 * 1000:.2f}ms")
       print()
       print("RECOMMENDATION: Replace isinstance() with type() is")
   elif speedup < 0.98:
       print(f"⚠️  isinstance() is {1/speedup:.2f}x FASTER")
       print()
       print("RECOMMENDATION: Keep using isinstance()")
   else:
       print("≈ Performance is similar (difference < 2%)")
       print()
       print("RECOMMENDATION: Keep using isinstance() for clarity")
   print()

   # Per-launch cost: each mean covers one run of `iterations` launches.
   print("Per-launch argument processing cost:")
   print(f"  isinstance(): {isinstance_mean / iterations * 1e6:.2f} µs")
   print(f"  type() is:    {type_is_mean / iterations * 1e6:.2f} µs")
   print()

I compiled the above benchmark mainly with the compiler flags -O3 and -march=native, building it via this setup script (setup_benchmark.py) before running it:

#!/usr/bin/env python3
"""
Setup script for building the Cython benchmark extension.
"""

from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy

# Cython directives applied when translating the benchmark module.
compiler_directives = {
    "language_level": 3,
    "boundscheck": False,
    "wraparound": False,
    "cdivision": True,
}

# The benchmark is a single C++ extension built with aggressive optimization
# flags and the NumPy headers on the include path.
benchmark_extension = Extension(
    name="benchmark_isinstance_cython",
    sources=["benchmark_isinstance_cython.pyx"],
    include_dirs=[numpy.get_include()],
    extra_compile_args=["-O3", "-march=native"],
    language="c++",
)

extensions = [benchmark_extension]

setup(
    name="benchmark_isinstance_cython",
    ext_modules=cythonize(extensions, compiler_directives=compiler_directives),
)

The script was then run with python setup_benchmark.py build_ext --inplace

Checklist

  • New or existing tests cover these changes.
  • The documentation is up to date with these changes.

@copy-pr-bot
Copy link
Contributor

copy-pr-bot bot commented Dec 1, 2025

This pull request requires additional validation before any workflows can run on NVIDIA's runners.

Pull request vetters can view their responsibilities here.

Contributors can view more details about this message here.

@bharatr21
Copy link
Author

/ok to test

@leofang leofang requested a review from mdboom December 1, 2025 15:49
@leofang leofang added enhancement Any code-related improvements P1 Medium priority - Should do cuda.core Everything related to the cuda.core module labels Dec 1, 2025
@leofang leofang added this to the cuda.core beta 10 milestone Dec 1, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

cuda.core Everything related to the cuda.core module enhancement Any code-related improvements P1 Medium priority - Should do

Projects

None yet

Development

Successfully merging this pull request may close these issues.

Replace isinstance checks by type == in ParamHolder

2 participants