Skip to content

Segfault on tutorial 3 #498

@JinkaiQiu

Description

@JinkaiQiu

There is a sporadic segfault that can occur on tutorial 3.
Running the script below (very similar to tutorial 3), which creates the environment many times, will likely reproduce at least one segfault.

import os
from pathlib import Path
import torch
import mediapy

# Set working directory to the base directory 'gpudrive'.
# Walk upward from the current directory until a directory named 'gpudrive'
# is found, then chdir into it so the relative dataset path resolves.
working_dir = Path.cwd()
while working_dir.name != 'gpudrive':
    parent_dir = working_dir.parent
    # Give up when the next step would land on the home directory, or when
    # we are already at the filesystem root (the root is its own parent).
    # Without the root check, a start location outside the home directory
    # would make this loop spin forever.
    if parent_dir == working_dir or parent_dir == Path.home():
        raise FileNotFoundError("Base directory 'gpudrive' not found")
    working_dir = parent_dir
os.chdir(working_dir)

from gpudrive.env.config import EnvConfig
from gpudrive.env.env_torch import GPUDriveTorchEnv
from gpudrive.visualize.utils import img_from_fig
from gpudrive.env.dataset import SceneDataLoader

MAX_NUM_OBJECTS = 64  # Upper bound on the number of objects we control in a scene
NUM_WORLDS = 2  # How many environments run in parallel
UNIQUE_SCENES = 2  # How many distinct scenes the loader draws from

# CPU keeps this example simple; the simulator is optimized for GPU, so use cuda if possible.
device = 'cpu'

# Discrete action grids: three evenly spaced values each, rounded to 3 decimals.
steer_grid = torch.round(torch.linspace(-1.0, 1.0, 3), decimals=3)
accel_grid = torch.round(torch.linspace(-3, 3, 3), decimals=3)
env_config = EnvConfig(steer_actions=steer_grid, accel_actions=accel_grid)

# Make dataloader
loader_options = {
    "root": "data/processed/examples",  # Path to the dataset
    # Batch size matches the number of worlds (envs) so that every world
    # receives a different scene.
    "batch_size": NUM_WORLDS,
    "dataset_size": UNIQUE_SCENES,  # Total number of different scenes to use
    "sample_with_replacement": False,
    "seed": 42,
    "shuffle": True,
}
data_loader = SceneDataLoader(**loader_options)

# Reproduction loop: construct the environment 50 times in a row, reusing the
# same config and data loader. Each iteration rebinds `env`, so this script
# keeps no reference to the previously constructed environment.
# NOTE(review): presumably the previous environment is torn down when it is
# rebound, and that may matter for the crash - confirm.
for i in range(50):
    print(f"On iteration {i}")
    # Make environment
    env = GPUDriveTorchEnv(
        config=env_config,
        data_loader=data_loader,
        max_cont_agents=MAX_NUM_OBJECTS, # Maximum number of agents to control per scenario
        device=device,
    )
    # reset() is left disabled here; the loop only exercises construction.
    # env.reset()  # Reset the environment to start a new episode
    print("Environment create complete ______")

GDB output:

Thread 698 "python" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7fede9d81640 (LWP 3475700)]
_ZZN7madrona12StateManager12iterateQueryIJNS_4phys10broadphase3BVHEEZNS_9TaskGraph12iterateQueryINS_7ContextEPFvRS7_RS4_EJS4_EEEvRT_RNS_5QueryIJDpT1_EEEOT0_EUlDpRT_E_EEvjRKNSE_IJDpSL_EEESK_ENKUliSP_E_clIJPS4_EEEDaiSP_ (num_rows=<optimized out>, ptrs=<optimized out>, this=<optimized out>) at /home/jinkai.qiu/work/gpudrive/external/madrona/src/common/../../include/madrona/state.inl:427
427                 fn(ptrs[i] ...);
(gdb) bt
#0  _ZZN7madrona12StateManager12iterateQueryIJNS_4phys10broadphase3BVHEEZNS_9TaskGraph12iterateQueryINS_7ContextEPFvRS7_RS4_EJS4_EEEvRT_RNS_5QueryIJDpT1_EEEOT0_EUlDpRT_E_EEvjRKNSE_IJDpSL_EEESK_ENKUliSP_E_clIJPS4_EEEDaiSP_ (num_rows=<optimized out>, ptrs=<optimized out>, this=<optimized out>)
    at /home/jinkai.qiu/work/gpudrive/external/madrona/src/common/../../include/madrona/state.inl:427
#1  _ZN7madrona12StateManager21iterateArchetypesImplIJNS_4phys10broadphase3BVHEEZNS0_12iterateQueryIJS4_EZNS_9TaskGraph12iterateQueryINS_7ContextEPFvRS8_RS4_EJS4_EEEvRT_RNS_5QueryIJDpT1_EEEOT0_EUlDpRT_E_EEvjRKNSF_IJDpSM_EEESL_EUliSQ_E_JLj0EEEEvjST_SL_NSt6__mad116integer_sequenceIjJXspT1_EEEE (
    this=0x2ab45a00, query=..., world_id=<optimized out>, fn=...)
    at /home/jinkai.qiu/work/gpudrive/external/madrona/src/common/../../include/madrona/state.inl:413
#2  _ZN7madrona12StateManager17iterateArchetypesIJNS_4phys10broadphase3BVHEEZNS0_12iterateQueryIJS4_EZNS_9TaskGraph12iterateQueryINS_7ContextEPFvRS8_RS4_EJS4_EEEvRT_RNS_5QueryIJDpT1_EEEOT0_EUlDpRT_E_EEvjRKNSF_IJDpSM_EEESL_EUliSQ_E_EEvjST_SL_ (this=0x2ab45a00, query=..., 
    world_id=<optimized out>, fn=...) at /home/jinkai.qiu/work/gpudrive/external/madrona/src/common/../../include/madrona/state.inl:388
#3  _ZN7madrona12StateManager12iterateQueryIJNS_4phys10broadphase3BVHEEZNS_9TaskGraph12iterateQueryINS_7ContextEPFvRS7_RS4_EJS4_EEEvRT_RNS_5QueryIJDpT1_EEEOT0_EUlDpRT_E_EEvjRKNSE_IJDpSL_EEESK_ (this=0x2ab45a00, query=..., world_id=<optimized out>, fn=...)
    at /home/jinkai.qiu/work/gpudrive/external/madrona/src/common/../../include/madrona/state.inl:424
#4  madrona::TaskGraph::iterateQuery<madrona::Context, void (*)(madrona::Context&, madrona::phys::broadphase::BVH&), madrona::phys::broadphase::BVH> (this=<optimized out>, ctx=..., query=..., fn=<optimized out>)
    at /home/jinkai.qiu/work/gpudrive/external/madrona/src/common/../../include/madrona/taskgraph.inl:17
--Type <RET> for more, q to quit, c to continue without paging--c
#5  madrona::ParallelForNode<madrona::Context, &madrona::phys::broadphase::updateBVHEntry, madrona::phys::broadphase::BVH>::run (this=<optimized out>, ctx_base=..., taskgraph=...) at /home/jinkai.qiu/work/gpudrive/external/madrona/src/common/../../include/madrona/taskgraph_builder.inl:76
#6  std::__mad1::__invoke[abi:nn180100]<void (madrona::ParallelForNode<madrona::Context, &madrona::phys::broadphase::updateBVHEntry, madrona::phys::broadphase::BVH>::*)(madrona::Context&, madrona::TaskGraph&), madrona::ParallelForNode<madrona::Context, &madrona::phys::broadphase::updateBVHEntry, madrona::phys::broadphase::BVH>*, madrona::Context&, madrona::TaskGraph&, void>(void (madrona::ParallelForNode<madrona::Context, &madrona::phys::broadphase::updateBVHEntry, madrona::phys::broadphase::BVH>::*&&)(madrona::Context&, madrona::TaskGraph&), madrona::ParallelForNode<madrona::Context, &madrona::phys::broadphase::updateBVHEntry, madrona::phys::broadphase::BVH>*&&, madrona::Context&, madrona::TaskGraph&) (__args=..., __args=..., __f=<optimized out>, __a0=<optimized out>) at /home/jinkai.qiu/work/gpudrive/external/madrona/external/madrona-toolchain/bundled-toolchain/libcxx-noexcept/include/c++/v1/__type_traits/invoke.h:312
#7  std::__mad1::invoke[abi:nn180100]<void (madrona::ParallelForNode<madrona::Context, &madrona::phys::broadphase::updateBVHEntry, madrona::phys::broadphase::BVH>::*)(madrona::Context&, madrona::TaskGraph&), madrona::ParallelForNode<madrona::Context, &madrona::phys::broadphase::updateBVHEntry, madrona::phys::broadphase::BVH>*, madrona::Context&, madrona::TaskGraph&>(void (madrona::ParallelForNode<madrona::Context, &madrona::phys::broadphase::updateBVHEntry, madrona::phys::broadphase::BVH>::*&&)(madrona::Context&, madrona::TaskGraph&), madrona::ParallelForNode<madrona::Context, &madrona::phys::broadphase::updateBVHEntry, madrona::phys::broadphase::BVH>*&&, madrona::Context&, madrona::TaskGraph&) (__args=<optimized out>, __args=<optimized out>, __f=<optimized out>, __args=<optimized out>) at /home/jinkai.qiu/work/gpudrive/external/madrona/external/madrona-toolchain/bundled-toolchain/libcxx-noexcept/include/c++/v1/__functional/invoke.h:28
#8  _ZZN7madrona16TaskGraphBuilder9addNodeFnITnDaXadL_ZNS_15ParallelForNodeINS_7ContextEXadL_ZNS_4phys10broadphase14updateBVHEntryERS3_RNS5_3BVHEEEJS7_EE3runES6_RNS_9TaskGraphEEES9_EENS_15TaskGraphNodeIDENS0_11TypedDataIDIT0_EENS_4SpanIKSC_EENS_8OptionalISC_EEENKUlPNS_8NodeBaseEPS3_PSA_E_clESM_SN_SO_ (node_data=<optimized out>, ctx=<optimized out>, task_graph=<optimized out>, this=<optimized out>) at /home/jinkai.qiu/work/gpudrive/external/madrona/src/common/../../include/madrona/taskgraph_builder.inl:36
#9  _ZZN7madrona16TaskGraphBuilder9addNodeFnITnDaXadL_ZNS_15ParallelForNodeINS_7ContextEXadL_ZNS_4phys10broadphase14updateBVHEntryERS3_RNS5_3BVHEEEJS7_EE3runES6_RNS_9TaskGraphEEES9_EENS_15TaskGraphNodeIDENS0_11TypedDataIDIT0_EENS_4SpanIKSC_EENS_8OptionalISC_EEENUlPNS_8NodeBaseEPS3_PSA_E_8__invokeESM_SN_SO_ (node_data=<optimized out>, ctx=<optimized out>, task_graph=<optimized out>) at /home/jinkai.qiu/work/gpudrive/external/madrona/src/common/../../include/madrona/taskgraph_builder.inl:33
#10 0x00007ffef7f94d37 in madrona::TaskGraph::run (this=0x120c4548, ctx=0x9136d40) at /home/jinkai.qiu/work/gpudrive/external/madrona/src/core/taskgraph.cpp:168
#11 0x00007ffef7efdded in madrona::ThreadPoolExecutor::Impl::workerThread (this=0x2ab458c0, worker_id=<optimized out>) at /home/jinkai.qiu/work/gpudrive/external/madrona/src/mw/cpu_exec.cpp:224
#12 0x00007ffef7efdeca in madrona::ThreadPoolExecutor::Impl::make(madrona::ThreadPoolExecutor::Config const&)::$_0::operator()(madrona::ThreadPoolExecutor::Impl*, long) const (this=0x43777350, impl=0x3f10bf00, i=-1) at /home/jinkai.qiu/work/gpudrive/external/madrona/src/mw/cpu_exec.cpp:120
#13 std::__mad1::__invoke[abi:nn180100]<madrona::ThreadPoolExecutor::Impl::make(madrona::ThreadPoolExecutor::Config const&)::$_0, madrona::ThreadPoolExecutor::Impl*, long>(madrona::ThreadPoolExecutor::Impl::make(madrona::ThreadPoolExecutor::Config const&)::$_0&&, madrona::ThreadPoolExecutor::Impl*&&, long&&) (__f=..., __args=@0x43777360: 1, __args=@0x43777360: 1) at /home/jinkai.qiu/work/gpudrive/external/madrona/external/madrona-toolchain/bundled-toolchain/libcxx-noexcept/include/c++/v1/__type_traits/invoke.h:344
#14 std::__mad1::__thread_execute[abi:nn180100]<std::__mad1::unique_ptr<std::__mad1::__thread_struct, std::__mad1::default_delete<std::__mad1::__thread_struct> >, madrona::ThreadPoolExecutor::Impl::make(madrona::ThreadPoolExecutor::Config const&)::$_0, madrona::ThreadPoolExecutor::Impl*, long, 2ul, 3ul>(std::__mad1::tuple<std::__mad1::unique_ptr<std::__mad1::__thread_struct, std::__mad1::default_delete<std::__mad1::__thread_struct> >, madrona::ThreadPoolExecutor::Impl::make(madrona::ThreadPoolExecutor::Config const&)::$_0, madrona::ThreadPoolExecutor::Impl*, long>&, std::__mad1::__tuple_indices<2ul, 3ul>) (__t=...) at /home/jinkai.qiu/work/gpudrive/external/madrona/external/madrona-toolchain/bundled-toolchain/libcxx-noexcept/include/c++/v1/__thread/thread.h:193
#15 std::__mad1::__thread_proxy[abi:nn180100]<std::__mad1::tuple<std::__mad1::unique_ptr<std::__mad1::__thread_struct, std::__mad1::default_delete<std::__mad1::__thread_struct> >, madrona::ThreadPoolExecutor::Impl::make(madrona::ThreadPoolExecutor::Config const&)::$_0, madrona::ThreadPoolExecutor::Impl*, long> >(void*) (__vp=0x43777350) at /home/jinkai.qiu/work/gpudrive/external/madrona/external/madrona-toolchain/bundled-toolchain/libcxx-noexcept/include/c++/v1/__thread/thread.h:202
#16 0x00007ffff7c94ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#17 0x00007ffff7d26850 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions