diff --git a/pyproject.toml b/pyproject.toml index db9059056..d10f84d66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,3 +72,6 @@ reportInvalidTypeForm = "warning" [tool.pytest.ini_options] pythonpath = ["."] +markers = [ + "memory_leak: Marks test as known to leak objects" +] diff --git a/slangpy/testing/helpers.py b/slangpy/testing/helpers.py index 8abd0f930..d1b32d9ee 100644 --- a/slangpy/testing/helpers.py +++ b/slangpy/testing/helpers.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import gc import hashlib import sys from pathlib import Path @@ -26,7 +27,7 @@ LogLevel, NativeHandle, ) -from slangpy.types.buffer import NDBuffer +from slangpy.types.buffer import NDBuffer, get_lookup_module from slangpy.core.function import Function # Global variables for device isolation. If SELECTED_DEVICE_TYPES is None, no restriction. @@ -44,6 +45,9 @@ else: raise RuntimeError("Unsupported platform") +# If live object tracking is supported, enable leak tracking +LEAK_TRACKING_ENABLED = hasattr(spy.Object, "report_live_objects") + # Called from pytest plugin if 'device-types' argument is provided def set_device_types(device_types_str: Optional[str]) -> None: @@ -82,6 +86,59 @@ def set_device_types(device_types_str: Optional[str]) -> None: USED_TORCH_DEVICES: bool = False METAL_PARAMETER_BLOCK_SUPPORT: Optional[bool] = None +TRACKED_LIVE_OBJECTS: Optional[list[Any]] = None + +# Types to ignore when checking for leaked objects +# - The reflection types are created and cached per device when buffers are loaded, so are hard +# to identify as actual leaks. +# - CoopVec is created on demand within the device when the coopvec api is used, and so will appear +# as a leak for cached devices. 
IGNORE_LIVE_OBJECT_TYPES = ["NativeSlangType", "TypeLayoutReflection", "TypeReflection", "CoopVec"]


def save_live_objects():
    """Record the currently live tracked objects as the leak-check baseline.

    Stores the result of ``spy.Object.report_live_objects`` in the module-level
    ``TRACKED_LIVE_OBJECTS``. Does nothing when ``LEAK_TRACKING_ENABLED`` is false.
    """
    global TRACKED_LIVE_OBJECTS
    if LEAK_TRACKING_ENABLED:
        TRACKED_LIVE_OBJECTS = spy.Object.report_live_objects(True)


def compare_and_save_live_objects(allowed_leaks: Optional[dict[str, int]] = None):
    """Compare live objects against the saved baseline, then store a new baseline.

    An object counts as leaked when its address (the ``"object"`` entry) was not in
    the previous snapshot, its class is not listed in ``IGNORE_LIVE_OBJECT_TYPES``
    and the per-class allowance from ``allowed_leaks`` is exhausted.

    Args:
        allowed_leaks: Optional mapping of class name to the number of new objects
            of that class that may appear without being reported. The mapping is
            copied, never modified.

    Raises:
        RuntimeError: If any leaked objects are found. The baseline is not updated
            in that case.
    """
    global TRACKED_LIVE_OBJECTS
    if not LEAK_TRACKING_ENABLED:
        return

    # Keep collecting until a pass frees nothing, so objects released by
    # finalizers of earlier passes are gone before the snapshot is taken.
    while gc.collect() > 0:
        pass

    # Work on a copy so the caller's dict is left untouched.
    remaining_allowance = dict(allowed_leaks) if allowed_leaks else {}

    current = spy.Object.report_live_objects(True)

    # Compare against an explicit None: an empty baseline is still a valid
    # baseline, and everything alive now is new relative to it.
    if TRACKED_LIVE_OBJECTS is not None:
        known_addresses = {entry["object"] for entry in TRACKED_LIVE_OBJECTS}

        leaked = []
        for entry in current:
            if entry["object"] in known_addresses:
                continue
            class_name = entry["class_name"]
            if class_name in IGNORE_LIVE_OBJECT_TYPES:
                continue
            if remaining_allowance.get(class_name, 0) > 0:
                # Consume one unit of the allowance for this class.
                remaining_allowance[class_name] -= 1
                continue
            leaked.append(entry)

        if leaked:
            msg = "\n".join(f" {entry}" for entry in leaked)
            raise RuntimeError(f"Leaked objects detected:\n{msg}")

    # Store updated live objects list
    TRACKED_LIVE_OBJECTS = current
def check_live_objects():
    """Sanity-check the native objects that are still alive.

    Counts live objects per class name and compares the counts with upper bounds
    derived from the number of cached devices. A known class that exceeds its
    bound only produces a printed warning.

    Raises:
        RuntimeError: If a live object belongs to a class that is neither ignored
            nor listed in the expected counts.
    """
    # Several passes, so objects released by finalizers are collected as well.
    for _ in range(3):
        gc.collect()

    objs = spy.Object.report_live_objects(False)

    num_cache_devices = len(DEVICE_CACHE)

    # Estimate how many of these global types can exist based on number of cached devices.
    # Most are 1-to-1, however slangpy can load an extra module per device for type lookups,
    # which also results in the potential creation of a program layout per device.
    max_expected_counts = {
        "Logger": num_cache_devices,
        "Device": num_cache_devices,
        "HotReload": num_cache_devices,
        "SlangSession": num_cache_devices,
        "SlangModule": num_cache_devices * 2,
        "SlangModuleData": num_cache_devices * 2,
        "SlangSessionData": num_cache_devices,
        "Fence": num_cache_devices,
        "FileSystemWatcher": num_cache_devices,
        "ProgramLayout": num_cache_devices,
        "CoopVec": num_cache_devices,
    }

    # Loggers are known to persist, and the type info is not strictly bounded, as
    # type infos used by buffers in slangpy are cached per device.
    ignore_classes = {
        "Logger",
        "LoggerOutput",
        "TypeReflection",
        "TypeLayoutReflection",
        "NativeSlangType",
    }

    # Tally live objects per class, keeping first-seen order for the report.
    actual_count_by_class_name = {}
    for obj in objs:
        class_name = obj["class_name"]
        actual_count_by_class_name[class_name] = actual_count_by_class_name.get(class_name, 0) + 1

    for class_name, count in actual_count_by_class_name.items():
        if class_name in ignore_classes:
            continue
        if class_name not in max_expected_counts:
            print(f"Warning: Unexpected {class_name} count (actual: {count})")
            raise RuntimeError(f"Unexpected {class_name} count (actual: {count})")
        expected = max_expected_counts[class_name]
        if count > expected:
            print(
                f"Warning: {class_name} count mismatch (expected: {expected}, actual: {count})"
            )


@pytest.hookimpl(wrapper=True)
def pytest_pyfunc_call(pyfuncitem: pytest.Function):
    """Wrap each test call with a native-object leak check.

    When leak tracking is enabled, a baseline of live objects is saved before the
    test runs and compared afterwards. The ``memory_leak`` marker changes this:

    * with a ``details`` keyword, that mapping is passed on as the allowed leaks;
    * without ``details``, the leak check is skipped for the test.

    If the test itself raises, the exception propagates from the ``yield`` and the
    leak check is not run.
    """
    # Defined unconditionally so the code after the yield never depends on which
    # branch ran before it.
    leak_check = False
    allowed_leaks = None

    if LEAK_TRACKING_ENABLED:
        # Check if leak tests enabled, and optionally read list of allowed leaks
        leak_check = True
        leaks_mem_marker = pyfuncitem.get_closest_marker("memory_leak")
        if leaks_mem_marker is not None:
            allowed_leaks = leaks_mem_marker.kwargs.get("details")
            leak_check = allowed_leaks is not None

        # If checks enabled, save current live objects.
        if leak_check:
            save_live_objects()

    # If the outcome is an exception, will raise the exception.
    res = yield

    # If checks enabled, immediately close any left over devices, then
    # check for leaked objects.
    if leak_check:
        # NOTE(review): close_leaked_devices is not among the names this change adds
        # to the helpers import list -- confirm it is already imported in this module.
        close_leaked_devices()
        compare_and_save_live_objects(allowed_leaks)

    # Return result
    return res
logger", details={"Logger": 1}) @pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) def test_load_module_and_cleanup_in_reverse_order(device_type: spy.DeviceType): device = helpers.get_device(device_type, use_cache=False) diff --git a/slangpy/tests/device/test_print.py b/slangpy/tests/device/test_print.py index 95b7b02f2..e46d63104 100644 --- a/slangpy/tests/device/test_print.py +++ b/slangpy/tests/device/test_print.py @@ -7,6 +7,7 @@ from slangpy.testing import helpers +@pytest.mark.memory_leak("Leaks logger", details={"Logger": 1}) @pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) def test_print(device_type: spy.DeviceType): device = spy.Device(type=device_type, enable_print=True, label=f"print-{device_type.name}") diff --git a/slangpy/tests/device/test_reflection.py b/slangpy/tests/device/test_reflection.py index 63ebd1ab5..47fefb303 100644 --- a/slangpy/tests/device/test_reflection.py +++ b/slangpy/tests/device/test_reflection.py @@ -1157,6 +1157,10 @@ def test_is_sub_type(test_id: str, device_type: spy.DeviceType): assert module.layout.is_sub_type(t, i) +@pytest.mark.memory_leak( + "Leaks a module", + details={"SlangModule": 1, "SlangModuleData": 2, "SlangSessionData": 1, "ProgramLayout": 1}, +) @pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) def test_hot_reload_invalid(test_id: str, device_type: spy.DeviceType): device = helpers.get_device(type=device_type) diff --git a/slangpy/tests/device/test_torch_interop.py b/slangpy/tests/device/test_torch_interop.py index 414bddecc..6018abdf2 100644 --- a/slangpy/tests/device/test_torch_interop.py +++ b/slangpy/tests/device/test_torch_interop.py @@ -8,6 +8,7 @@ from slangpy.testing import helpers +@pytest.mark.memory_leak("Leaks a whole device!") @pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) def test_buffer_to_torch(device_type: spy.DeviceType): if device_type == spy.DeviceType.cuda: @@ -50,6 +51,7 @@ def 
test_buffer_to_torch(device_type: spy.DeviceType): ) +@pytest.mark.memory_leak("Leaks logger", details={"Logger": 1}) @pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) def test_torch_interop(device_type: spy.DeviceType): if device_type == spy.DeviceType.cuda: diff --git a/slangpy/tests/slangpy_tests/test_modules.py b/slangpy/tests/slangpy_tests/test_modules.py index d7439614f..f80f06bd6 100644 --- a/slangpy/tests/slangpy_tests/test_modules.py +++ b/slangpy/tests/slangpy_tests/test_modules.py @@ -114,6 +114,10 @@ def test_call_mutable_func(device_type: DeviceType): assert np.allclose(data[:2], [0.05, 0.1]) +@pytest.mark.memory_leak( + "Leaks modules, probably issue with looking up types by name", + details={"ShaderProgram": 1, "ShaderProgramData": 1, "SlangModuleData": 2, "SlangModule": 2}, +) @pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) def test_read_back_with_global_func(device_type: DeviceType): m = load_test_module(device_type) diff --git a/slangpy/tests/slangpy_tests/test_packed_arg.py b/slangpy/tests/slangpy_tests/test_packed_arg.py index 0cb6ca598..990a5361e 100644 --- a/slangpy/tests/slangpy_tests/test_packed_arg.py +++ b/slangpy/tests/slangpy_tests/test_packed_arg.py @@ -7,6 +7,7 @@ from slangpy.testing import helpers +@pytest.mark.memory_leak("Leaks call data cache", details={"NativeCallDataCache": 1}) @pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) def test_simple_int_call(device_type: DeviceType): @@ -28,6 +29,7 @@ def test_simple_int_call(device_type: DeviceType): assert result == 42 +@pytest.mark.memory_leak("Leaks call data cache", details={"NativeCallDataCache": 1}) @pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) def test_simple_struct_call(device_type: DeviceType): @@ -53,6 +55,7 @@ def test_simple_struct_call(device_type: DeviceType): assert result == 42 +@pytest.mark.memory_leak("Leaks call data cache", details={"NativeCallDataCache": 1}) 
@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) def test_vectorize_struct_array(device_type: DeviceType): @@ -88,6 +91,7 @@ def test_vectorize_struct_array(device_type: DeviceType): assert np.array_equal(results, np.array([2, 3, 4, 5], dtype=np.int32)) +@pytest.mark.memory_leak("Leaks call data cache", details={"NativeCallDataCache": 1}) @pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES) def test_vectorize_struct_with_tensor_array(device_type: DeviceType): if device_type == DeviceType.metal: diff --git a/slangpy/tests/slangpy_tests/test_textures.py b/slangpy/tests/slangpy_tests/test_textures.py index bc8da4692..84bdfc514 100644 --- a/slangpy/tests/slangpy_tests/test_textures.py +++ b/slangpy/tests/slangpy_tests/test_textures.py @@ -502,6 +502,10 @@ def texture_return_value_impl( assert np.allclose(result_np, data.squeeze()) +@pytest.mark.memory_leak( + "Leaks modules, probably issue with looking up types by name", + details={"ShaderProgram": 1, "ShaderProgramData": 1, "SlangModuleData": 2, "SlangModule": 2}, +) @pytest.mark.parametrize( "texel_name", ["uint8_t", "uint16_t", "int8_t", "int16_t", "float", "half", "uint"] ) @@ -516,6 +520,10 @@ def test_texture_return_value(device_type: DeviceType, texel_name: str, dims: in # This case checks for when the return type is the string "texture". # This checks a subset of the "test_texture_return_value" parameters. 
+@pytest.mark.memory_leak( + "Leaks modules, probably issue with looking up types by name", + details={"ShaderProgram": 1, "ShaderProgramData": 1, "SlangModuleData": 2, "SlangModule": 2}, +) @pytest.mark.parametrize("texel_name", ["float"]) @pytest.mark.parametrize("dims", [1, 2, 3]) @pytest.mark.parametrize("channels", [4]) diff --git a/slangpy/tests/slangpy_tests/test_torchbuffers.py b/slangpy/tests/slangpy_tests/test_torchbuffers.py index bd1e90bbd..c48c02738 100644 --- a/slangpy/tests/slangpy_tests/test_torchbuffers.py +++ b/slangpy/tests/slangpy_tests/test_torchbuffers.py @@ -163,6 +163,7 @@ def run_tensor_race_condition_tests( # Pytest for our most common default cuda-interop case, in which we've configured pytorch # and slangpy to share the same context and stream. +@pytest.mark.memory_leak("Leaks logger", details={"Logger": 1}) @pytest.mark.parametrize("device_type", [spy.DeviceType.cuda]) def test_shared_context_and_stream(device_type: spy.DeviceType): assert ( @@ -174,12 +175,14 @@ def test_shared_context_and_stream(device_type: spy.DeviceType): # Pytest for none-shared context case, which appears to avoid race conditions through some level # of synchronization in the default streams of separate contexts. For now this has shown not # to cause race conditions, so testing for that behaviour. +@pytest.mark.memory_leak("Leaks logger", details={"Logger": 1}) @pytest.mark.parametrize("device_type", [spy.DeviceType.cuda]) def test_non_shared_context(device_type: spy.DeviceType): assert run_tensor_race_condition_tests(share_context=False) == False # Pytest for known race condition case, where we use a custom stream in torch but not sharing it with slangpy. 
+@pytest.mark.memory_leak("Leaks logger", details={"Logger": 1}) @pytest.mark.parametrize("device_type", [spy.DeviceType.cuda]) def test_custom_stream_no_share(device_type: spy.DeviceType): pytest.skip("Race condition doesn't reproduce reliably on CI machines of varying specs") @@ -190,6 +193,7 @@ def test_custom_stream_no_share(device_type: spy.DeviceType): # Pytest that removes the race condition by sharing the custom stream +@pytest.mark.memory_leak("Leaks logger", details={"Logger": 1}) @pytest.mark.parametrize("device_type", [spy.DeviceType.cuda]) def test_custom_stream_share(device_type: spy.DeviceType): assert ( diff --git a/src/sgl/core/object.cpp b/src/sgl/core/object.cpp index 8518e877a..7b531201a 100644 --- a/src/sgl/core/object.cpp +++ b/src/sgl/core/object.cpp @@ -117,26 +117,37 @@ PyObject* Object::self_py() const noexcept #if SGL_ENABLE_OBJECT_TRACKING -void Object::report_live_objects() +std::string LiveObjectInfo::to_string() +{ + return fmt::format( + "address={}, self_py={}, ref_count={}, class_name={}", + fmt::ptr(object), + self_py ? fmt::ptr(self_py) : "null", + ref_count, + class_name + ); +} + +std::vector Object::report_live_objects(bool log_to_tty) { std::lock_guard lock(s_tracked_objects_mutex); + std::vector res; if (!s_tracked_objects.empty()) { - fmt::println("Found {} live objects!", s_tracked_objects.size()); + if (log_to_tty) + fmt::println("Found {} live objects!", s_tracked_objects.size()); for (const Object* object : s_tracked_objects) { uint64_t ref_count = object->ref_count(); PyObject* self_py = object->self_py(); if (self_py) ref_count = object_ref_cnt_py(self_py); - fmt::println( - "Live object: {} self_py={} ref_count={} class_name=\"{}\"", - fmt::ptr(object), - self_py ? 
fmt::ptr(self_py) : "null", - ref_count, - object->class_name() - ); + LiveObjectInfo info{object, ref_count, self_py, object->class_name()}; + if (log_to_tty) + fmt::println("Live object: {}", info.to_string()); + res.push_back(info); object->report_refs(); } } + return res; } void Object::report_refs() const diff --git a/src/sgl/core/object.h b/src/sgl/core/object.h index 64e2296b6..bec4512de 100644 --- a/src/sgl/core/object.h +++ b/src/sgl/core/object.h @@ -48,6 +48,19 @@ static constexpr bool SGL_TRACK_ALL_REFS{false}; namespace sgl { +class Object; + +#if SGL_ENABLE_OBJECT_TRACKING +struct LiveObjectInfo { + const Object* object; + uint64_t ref_count; + PyObject* self_py; + const char* class_name; + + std::string to_string(); +}; +#endif + /** * \brief Object base class with intrusive reference counting * @@ -143,7 +156,7 @@ class SGL_API Object { #if SGL_ENABLE_OBJECT_TRACKING /// Reports current set of live objects. - static void report_live_objects(); + static std::vector report_live_objects(bool log_to_tty = true); /// Report references of this object. void report_refs() const; diff --git a/src/slangpy_ext/core/object.cpp b/src/slangpy_ext/core/object.cpp index ab5ab8d74..9e0940bd5 100644 --- a/src/slangpy_ext/core/object.cpp +++ b/src/slangpy_ext/core/object.cpp @@ -32,7 +32,27 @@ SGL_PY_EXPORT(core_object) "Base class for all reference counted objects." ) #if SGL_ENABLE_OBJECT_TRACKING - .def_static("report_live_objects", &Object::report_live_objects) + .def_static( + "report_live_objects", + [](bool log_to_tty = true) + { + // We want to avoid creating new live objects by reporting objects, so instead of + // creating bindings for LiveObjectInfo, convert each to a dictionary. 
+ auto live_objects = Object::report_live_objects(log_to_tty); + nb::list result; + for (const auto& info : live_objects) { + nb::dict obj_dict; + obj_dict["object"] = reinterpret_cast(info.object); + obj_dict["ref_count"] = info.ref_count; + obj_dict["self_py"] = reinterpret_cast(info.self_py); + obj_dict["class_name"] = info.class_name; + result.append(obj_dict); + } + return result; + }, + "log_to_tty"_a = true, + "Returns a list of dictionaries containing information about live objects" + ) #endif .def("__repr__", &Object::to_string); }