Skip to content

Commit f161652

Browse files
committed
Add a brief description for each of the examples
1 parent 1d1171f commit f161652

File tree

9 files changed

+58
-11
lines changed

9 files changed

+58
-11
lines changed

cuda_core/examples/jit_lto_fractal.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44

55
# ################################################################################
66
#
7-
# This demo aims to illustrate a couple takeaways:
7+
# This demo illustrates:
88
#
99
# 1. How to use the JIT LTO feature provided by the Linker class to link multiple objects together
1010
# 2. That linking allows for libraries to modify workflows dynamically at runtime

cuda_core/examples/pytorch_example.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,16 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5-
## Usage: pip install "cuda-core[cu12]"
6-
## python python_example.py
5+
# ################################################################################
6+
#
7+
# This demo illustrates how to use `cuda.core` to compile a CUDA kernel
8+
# and launch it using PyTorch tensors as inputs.
9+
#
10+
# ## Usage: pip install "cuda-core[cu12]"
11+
# ## python pytorch_example.py
12+
#
13+
# ################################################################################
14+
715
import sys
816

917
import torch

cuda_core/examples/saxpy.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,15 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
# ################################################################################
6+
#
7+
# This demo illustrates how to use `cuda.core` to compile a templated CUDA kernel
8+
# and launch it using `cupy` arrays as inputs. This is a simple example of a
9+
# templated kernel, where the kernel is instantiated for both `float` and `double`
10+
# data types.
11+
#
12+
# ################################################################################
13+
514
import sys
615

716
import cupy as cp
@@ -32,6 +41,10 @@
3241
arch = "".join(f"{i}" for i in dev.compute_capability)
3342
program_options = ProgramOptions(std="c++11", arch=f"sm_{arch}")
3443
prog = Program(code, code_type="c++", options=program_options)
44+
45+
# Note the use of the `name_expressions` argument to specify the template
46+
# instantiations of the kernel that we will use. For non-templated kernels,
47+
# `name_expressions` will simply contain the name of the kernels.
3548
mod = prog.compile(
3649
"cubin",
3750
logs=sys.stdout,

cuda_core/examples/show_device_properties.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,13 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
# ################################################################################
6+
#
7+
# This demo illustrates how to use `cuda.core` to show the properties of the
8+
# CUDA devices in the system.
9+
#
10+
# ################################################################################
11+
512
import sys
613

714
from cuda.core.experimental import Device, system

cuda_core/examples/simple_multi_gpu_example.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,13 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
# ################################################################################
6+
#
7+
# This demo illustrates how to use `cuda.core` to compile and launch kernels
8+
# on multiple GPUs.
9+
#
10+
# ################################################################################
11+
512
import sys
613

714
import cupy as cp

cuda_core/examples/strided_memory_view_cpu.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,14 +4,13 @@
44

55
# ################################################################################
66
#
7-
# This demo aims to illustrate two takeaways:
7+
# This demo illustrates:
88
#
99
# 1. The similarity between CPU and GPU JIT-compilation with C++ sources
1010
# 2. How to use StridedMemoryView to interface with foreign C/C++ functions
1111
#
12-
# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU
13-
# path, which can be easily installed from pip or conda following their instructions.
14-
# We also use NumPy/CuPy as the CPU/GPU array container.
12+
# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be
13+
# easily installed from pip or conda following their instructions.
1514
#
1615
# ################################################################################
1716

cuda_core/examples/strided_memory_view_gpu.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,14 +4,13 @@
44

55
# ################################################################################
66
#
7-
# This demo aims to illustrate two takeaways:
7+
# This demo illustrates:
88
#
99
# 1. The similarity between CPU and GPU JIT-compilation with C++ sources
1010
# 2. How to use StridedMemoryView to interface with foreign C/C++ functions
1111
#
12-
# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU
13-
# path, which can be easily installed from pip or conda following their instructions.
14-
# We also use NumPy/CuPy as the CPU/GPU array container.
12+
# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be
13+
# easily installed from pip or conda following their instructions.
1514
#
1615
# ################################################################################
1716

cuda_core/examples/thread_block_cluster.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,13 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
# ################################################################################
6+
#
7+
# This demo illustrates the use of thread block clusters in the CUDA launch
8+
# configuration.
9+
#
10+
# ################################################################################
11+
512
import os
613
import sys
714

cuda_core/examples/vector_add.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,13 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
# ################################################################################
6+
#
7+
# This demo illustrates how to use `cuda.core` to compile and launch a simple
8+
# vector addition kernel.
9+
#
10+
# ################################################################################
11+
512
import cupy as cp
613

714
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch

0 commit comments

Comments (0)