diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d99b6c5..8fdba41 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -6,6 +6,8 @@ on:
   pull_request:
     branches: [master]
 
+permissions: read-all
+
 jobs:
   lint:
     runs-on: ubuntu-latest
diff --git a/speakeasy/profiler.py b/speakeasy/profiler.py
index 5f800f6..7244cbb 100644
--- a/speakeasy/profiler.py
+++ b/speakeasy/profiler.py
@@ -122,6 +122,7 @@ def __init__(self):
         self.coverage: set[int] = set()
         self.memory_regions: list[dict[str, Any]] = []
         self.loaded_modules: list[dict[str, Any]] = []
+        self.init_regs: dict[int, int] = {}
 
     def get_api_count(self):
         """
diff --git a/speakeasy/version.py b/speakeasy/version.py
index 4b3c3ea..9b8d3ee 100644
--- a/speakeasy/version.py
+++ b/speakeasy/version.py
@@ -1 +1 @@
-__version__ = "2.0.0a1"
+__version__ = "2.0.0b1"
diff --git a/speakeasy/windows/win32.py b/speakeasy/windows/win32.py
index 3539d70..257a3ef 100644
--- a/speakeasy/windows/win32.py
+++ b/speakeasy/windows/win32.py
@@ -222,7 +222,8 @@ def prepare_module_for_emulation(self, module, all_entrypoints):
             self.stop()
             raise Win32EmuError("Module not found")
 
-        # Check if any TLS callbacks exist, these run before the module's entry point
+        runs = []
+
         tls = module.get_tls_callbacks()
         for i, cb_addr in enumerate(tls):
             base = module.base
@@ -232,6 +233,7 @@ def prepare_module_for_emulation(self, module, all_entrypoints):
                 run.type = f"tls_callback_{i}"
                 run.args = [base, DLL_PROCESS_ATTACH, 0]
                 self.add_run(run)
+                runs.append(run)
 
         ep = module.base + module.ep
 
@@ -241,20 +243,14 @@ def prepare_module_for_emulation(self, module, all_entrypoints):
         if not module.is_exe():
             run.args = [module.base, DLL_PROCESS_ATTACH, 0]
             run.type = "dll_entry.DLL_PROCESS_ATTACH"
-            container = self.init_container_process()
-            if container:
-                self.processes.append(container)
-                self.curr_process = container
         else:
             run.type = "module_entry"
             run.args = [self.mem_map(8, tag=f"emu.module_arg_{i}") for i in range(4)]
 
         self.add_run(run)
+        runs.append(run)
 
         if all_entrypoints:
-            # Only emulate a subset of all the exported functions
-            # There are some modules (such as the windows kernel) with
-            # thousands of exports
             exports = [k for k in module.get_exports()[:MAX_EXPORTS_TO_EMULATE]]
 
             if exports:
@@ -275,14 +271,11 @@ def prepare_module_for_emulation(self, module, all_entrypoints):
                         argc, argv = self.build_service_main_args("IPRIP", char_width=char_width)
                         run.args = [argc, argv]
                     else:
-                        # Here we set dummy args to pass into the export function
                         run.args = args
-                    # Store these runs and only queue them before the unload
-                    # routine this is because some exports may not be ready to
-                    # be called yet
                     self.add_run(run)
+                    runs.append(run)
 
-        return
+        return runs
 
     def run_module(self, module, all_entrypoints=False, emulate_children=False):
         """
@@ -291,59 +284,57 @@ def run_module(self, module, all_entrypoints=False, emulate_children=False):
         Arguments:
             module: Module to emulate
         """
-        self.prepare_module_for_emulation(module, all_entrypoints)
+        runs = self.prepare_module_for_emulation(module, all_entrypoints)
 
-        # Create an empty process object for the module if none is
-        # supplied, only do this for the main module
-        if len(self.processes) == 0:
+        if not module.is_exe():
+            container = self.init_container_process()
+            if container:
+                p = container
+            else:
+                p = objman.Process(self, path=module.emu_path, base=module.base, pe=module, cmdline=self.command_line)
+        else:
             p = objman.Process(self, path=module.emu_path, base=module.base, pe=module, cmdline=self.command_line)
-            self.curr_process = p
-            self.om.objects.update({p.address: p})  # type: ignore[union-attr]
-            mm = self.get_address_map(module.base)
-            if mm:
-                mm.process = self.curr_process
 
-        t = objman.Thread(self, stack_base=self.stack_base, stack_commit=module.stack_commit)
+        self.processes.append(p)
+        self.om.objects.update({p.address: p})  # type: ignore[union-attr]
+        mm = self.get_address_map(module.base)
+        if mm:
+            mm.process = p
 
+        t = objman.Thread(self, stack_base=self.stack_base, stack_commit=module.stack_commit)
         self.om.objects.update({t.address: t})  # type: ignore[union-attr]
-        self.curr_process.threads.append(t)  # type: ignore[union-attr]
-        self.curr_thread = t
-
-        if self.run_queue:
-            self.run_queue[0].thread = t
+        t.process = p
+        p.threads.append(t)
 
-        peb = self.alloc_peb(self.curr_process)
+        for r in runs:
+            r.process_context = p
+            r.thread = t
 
-        # Set the TEB
-        self.init_teb(t, peb)
-
-        # Begin emulation of main module
         self.start()
 
         if not emulate_children or len(self.child_processes) == 0:
             return
 
-        # Emulate any child processes
         while len(self.child_processes) > 0:
             child = self.child_processes.pop(0)
 
             child.pe = self.load_module(data=child.pe_data)
-            self.prepare_module_for_emulation(child.pe, all_entrypoints)
+            child_runs = self.prepare_module_for_emulation(child.pe, all_entrypoints)
 
             self.command_line = child.cmdline
+            child.base = child.pe.base
 
-            self.curr_process = child
-            self.curr_process.base = child.pe.base
-            self.curr_thread = child.threads[0]
+            self.processes.append(child)
 
-            self.om.objects.update({self.curr_thread.address: self.curr_thread})  # type: ignore[union-attr]
+            child_thread = child.threads[0]
+            self.om.objects.update({child_thread.address: child_thread})  # type: ignore[union-attr]
 
-            # PEB and TEB will be initialized when the next run happens
+            for r in child_runs:
+                r.process_context = child
+                r.thread = child_thread
 
             self.start()
 
-        return
-
     def _init_name(self, path, data=None):
         if not data:
             self.file_name = os.path.basename(path)
@@ -419,8 +410,7 @@ def run_shellcode(self, sc_addr, stack_commit=0x4000, offset=0):
         if not target:
             raise Win32EmuError("Invalid shellcode address")
 
-        self.stack_base, stack_addr = self.alloc_stack(stack_commit)
-        self.set_func_args(self.stack_base, self.return_hook, 0x7000)
+        self.stack_base, _ = self.alloc_stack(stack_commit)
 
         run = Run()
         run.type = "shellcode"
@@ -428,36 +418,29 @@ def run_shellcode(self, sc_addr, stack_commit=0x4000, offset=0):
         run.instr_cnt = 0
         args = [self.mem_map(1024, tag=f"emu.shellcode_arg_{i}", base=0x41420000 + i) for i in range(4)]
         run.args = args
+        run.init_regs = {_arch.X86_REG_ECX: 1024}
 
-        self.reg_write(_arch.X86_REG_ECX, 1024)
-
-        self.add_run(run)
-
-        # Create an empty process object for the shellcode if none is
-        # supplied
         container = self.init_container_process()
         if container:
             self.processes.append(container)
-            self.curr_process = container
+            p = container
         else:
             p = objman.Process(self)
             self.processes.append(p)
-            self.curr_process = p
 
+        self.om.objects.update({p.address: p})  # type: ignore[union-attr]
         mm = self.get_address_map(sc_addr)
         if mm:
-            mm.process = self.curr_process
+            mm.process = p
 
         t = objman.Thread(self, stack_base=self.stack_base, stack_commit=stack_commit)
         self.om.objects.update({t.address: t})  # type: ignore[union-attr]
-        self.curr_process.threads.append(t)
-
-        self.curr_thread = t
-
-        peb = self.alloc_peb(self.curr_process)
+        t.process = p
+        p.threads.append(t)
 
-        # Set the TEB
-        self.init_teb(t, peb)
+        run.process_context = p
+        run.thread = t
+        self.add_run(run)
 
         self.start()
 
@@ -575,6 +558,38 @@ def init_container_process(self):
                 return proc
         return None
 
+    def _create_default_process(self, run):  # build and register a process for a run that has none
+        mod = self.get_module_from_addr(run.start_addr)  # module lookup by start address; a falsy result is handled below
+
+        if mod and getattr(mod, "is_exe", lambda: False)():  # objects lacking is_exe() are treated as non-EXE
+            p = objman.Process(
+                self,
+                path=getattr(mod, "emu_path", ""),
+                base=getattr(mod, "base", 0),
+                pe=mod,
+                cmdline=self.command_line,
+            )
+        else:
+            container = self.init_container_process()  # may return None
+            if container:
+                p = container
+            elif mod:  # non-EXE module and no container: back the process with the module itself (no cmdline passed)
+                p = objman.Process(
+                    self,
+                    path=getattr(mod, "emu_path", ""),
+                    base=getattr(mod, "base", 0),
+                    pe=mod,
+                )
+            else:  # neither a container nor a module is available
+                p = objman.Process(self)
+
+        self.processes.append(p)
+        self.om.objects.update({p.address: p})  # type: ignore[union-attr]
+        mm = self.get_address_map(run.start_addr)
+        if mm:
+            mm.process = p  # tag the address map found for the start address with the chosen process
+        return p
+
     def _init_user_modules_from_config(self):
         proc_mod = None
         for p in self.config.processes:
diff --git a/speakeasy/windows/winemu.py b/speakeasy/windows/winemu.py
index ea8f114..eac357d 100644
--- a/speakeasy/windows/winemu.py
+++ b/speakeasy/windows/winemu.py
@@ -403,7 +403,6 @@ def call(self, addr, params=[]):
         """
         Start emulating at the specified address
         """
-        self.reset_stack(self.stack_base)
         run = Run()
         run.type = f"call_0x{addr:x}"
         run.start_addr = addr
@@ -415,9 +414,50 @@ def call(self, addr, params=[]):
         else:
             self.add_run(run)
 
+    def _resolve_run_process(self, run):  # choose the process a run executes in, by descending priority
+        if run.process_context:  # 1) an explicit context on the run wins
+            return run.process_context
+        if run.thread and getattr(run.thread, "process", None):  # 2) else the process a pre-assigned thread is bound to
+            return run.thread.process
+        if self.curr_process:  # 3) else whatever process is already current
+            return self.curr_process
+        return self._create_default_process(run)  # 4) nothing known: create one
+
+    def _create_default_process(self, run):  # fallback process factory; `run` is not read by this implementation
+        p = objman.Process(self)
+        self.processes.append(p)  # record the new process in self.processes
+        self.om.objects.update({p.address: p})  # type: ignore[union-attr]
+        return p
+
+    def _resolve_run_thread(self, run, proc):  # return the thread for `run` under `proc`, creating one if needed
+        if run.thread:
+            tp = getattr(run.thread, "process", None)  # process the thread is already bound to, if any
+            if tp is None:  # unbound thread: attach it to `proc`
+                run.thread.process = proc
+                if run.thread not in proc.threads:  # avoid listing the same thread twice
+                    proc.threads.append(run.thread)
+            elif tp is not proc:  # identity check: bound to some other process object, so fail fast
+                raise WindowsEmuError(
+                    f"Run thread is bound to a different process "
+                    f"(thread.process={tp!r}, resolved={proc!r})"
+                )
+            return run.thread
+        if self.kernel_mode:  # no thread object is created in kernel mode; caller receives None
+            return None
+        thread = objman.Thread(self, stack_base=self.stack_base)
+        self.om.objects.update({thread.address: thread})  # type: ignore[union-attr]
+        thread.process = proc
+        proc.threads.append(thread)
+        run.thread = thread  # remember the new thread on the run
+        return thread
+
     def _prepare_run_context(self, run):
         """
         Prepare CPU and memory state for the given run without starting emulation.
+
+        This is the single canonical path for process/thread/PEB/TEB/TLS
+        activation. All run types (call, module entry, shellcode, thread)
+        converge here.
         """
         logger.info("* exec: %s", run.type)
 
@@ -430,30 +470,24 @@ def _prepare_run_context(self, run):
         stk_ptr = self.get_stack_ptr()
 
         self.set_func_args(stk_ptr, self.return_hook, *run.args)
+        for reg, val in run.init_regs.items():
+            self.reg_write(reg, val)
         stk_ptr = self.get_stack_ptr()
         stk_map = self.get_address_map(stk_ptr)
 
         self.curr_run.stack = MemAccess(base=stk_map.base, size=stk_map.size)
 
-        # Set the process context if possible
-        if run.process_context:
-            # Init a new peb if the process context changed:
-            if run.process_context != self.get_current_process():
-                self.alloc_peb(run.process_context)
-            self.set_current_process(run.process_context)
-        if run.thread:
-            self.set_current_thread(run.thread)
-        elif not self.kernel_mode:
-            thread = objman.Thread(self, stack_base=self.stack_base)
-            self.om.objects.update({thread.address: thread})
-            if self.curr_process:
-                thread.process = self.curr_process
-                self.curr_process.threads.append(thread)
-            run.thread = thread
+        proc = self._resolve_run_process(run)
+        run.process_context = proc
+        self.set_current_process(proc)
+        self.alloc_peb(proc)
+
+        thread = self._resolve_run_thread(run, proc)
+        if thread:
             self.set_current_thread(thread)
+            run.thread = thread
 
         if not self.kernel_mode:
-            # Reset the TIB data
             thread = self.get_current_thread()
             if thread:
                 self.init_teb(thread, self.curr_process.peb)  # type: ignore[union-attr]
@@ -517,8 +551,7 @@ def start(self, addr=None, size=None):
         self.set_hooks()
         self._set_emu_hooks()
 
-        # Initialize run context/register state before exposing the target to GDB,
-        # so the first stop reports a meaningful PC/SP/etc.
+        self.reset_stack(self.stack_base)
         self._prepare_run_context(run)
 
         if self.gdb_port is not None:
diff --git a/tests/test_call_api.py b/tests/test_call_api.py
new file mode 100644
index 0000000..28b1838
--- /dev/null
+++ b/tests/test_call_api.py
@@ -0,0 +1,250 @@
+import pytest
+
+from speakeasy import Speakeasy
+from speakeasy.errors import WindowsEmuError
+from speakeasy.profiler import Run
+from speakeasy.windows import objman
+
+DLL_BINS = ["dll_test_x86.dll.xz", "dll_test_x64.dll.xz"]
+EXE_BINS = ["argv_test_x86.exe.xz", "argv_test_x64.exe.xz"]
+
+
+@pytest.mark.parametrize("bin_file", DLL_BINS)
+def test_call_without_run_module(config, load_test_bin, bin_file):
+    """call() should work without run_module() being called first (GH-21)."""
+    data = load_test_bin(bin_file)
+    se = Speakeasy(config=config)
+    try:
+        mod = se.load_module(data=data)
+        se.call(mod.base + mod.ep, [mod.base, 1, 0])  # smoke test: passes if call() does not raise; 1 is presumably DLL_PROCESS_ATTACH
+    finally:
+        se.shutdown()  # runs even if the body raised
+
+
+@pytest.mark.parametrize("bin_file", DLL_BINS)
+def test_call_after_run_module(config, load_test_bin, bin_file):
+    """call() should still work after run_module() has set up context."""
+    data = load_test_bin(bin_file)
+    se = Speakeasy(config=config)
+    try:
+        mod = se.load_module(data=data)
+        se.run_module(mod)  # full module run first
+        se.call(mod.base + mod.ep, [mod.base, 1, 0])  # then call the entry point again; passes if nothing raises
+    finally:
+        se.shutdown()  # runs even if the body raised
+
+
+@pytest.mark.parametrize("bin_file", DLL_BINS)
+def test_call_queued_during_run(config, load_test_bin, bin_file):
+    """call() queued while run_queue is non-empty defers context to execution time."""
+    data = load_test_bin(bin_file)
+    se = Speakeasy(config=config)
+    try:
+        mod = se.load_module(data=data)
+        ep = mod.base + mod.ep
+        se.call(ep, [mod.base, 1, 0])
+        se.call(ep, [mod.base, 1, 0])  # NOTE(review): nothing is explicitly enqueued before this call (unlike test_queued_call_does_not_mutate_sp), so the queued path may not be exercised - confirm
+    finally:
+        se.shutdown()  # runs even if the body raised
+
+
+@pytest.mark.parametrize("bin_file", DLL_BINS)
+def test_call_context_is_consistent(config, load_test_bin, bin_file):
+    """After call(), every run has process_context, thread, and active PEB."""
+    data = load_test_bin(bin_file)
+    se = Speakeasy(config=config)
+    try:
+        mod = se.load_module(data=data)
+        se.call(mod.base + mod.ep, [mod.base, 1, 0])  # no run_module() beforehand
+
+        emu = se.emu  # emulator object whose state is inspected below
+        assert emu.curr_process is not None, "no process after call()"
+        assert emu.curr_process.is_peb_active, "PEB not activated"
+        assert emu.curr_thread is not None, "no thread after call()"
+        for run in emu.runs:  # each run in emu.runs must carry a matching process/thread pair
+            assert run.process_context is not None, f"run {run.type} missing process_context"
+            assert run.thread is not None, f"run {run.type} has no thread"
+            assert run.thread.process is run.process_context, (  # identity, not equality
+                f"run {run.type} thread.process mismatch"
+            )
+    finally:
+        se.shutdown()  # runs even if an assertion failed
+
+
+@pytest.mark.parametrize("bin_file", DLL_BINS)
+def test_call_context_consistent_after_run_module(config, load_test_bin, bin_file):
+    """After run_module() + call(), context is consistent across all runs."""
+    data = load_test_bin(bin_file)
+    se = Speakeasy(config=config)
+    try:
+        mod = se.load_module(data=data)
+        se.run_module(mod)  # module runs first
+        se.call(mod.base + mod.ep, [mod.base, 1, 0])  # then an extra call on the same emulator
+
+        emu = se.emu  # emulator object whose state is inspected below
+        assert emu.curr_process is not None
+        assert emu.curr_process.is_peb_active
+        assert emu.curr_thread is not None
+        for run in emu.runs:  # covers the runs from run_module() as well as the call() run
+            assert run.process_context is not None, f"run {run.type} missing process_context"
+            assert run.thread is not None, f"run {run.type} has no thread"
+            assert run.thread.process is run.process_context, (  # identity, not equality
+                f"run {run.type} thread.process mismatch"
+            )
+    finally:
+        se.shutdown()  # runs even if an assertion failed
+
+
+# --- EXE call() without run_module() ---
+
+
+@pytest.mark.parametrize("bin_file", EXE_BINS)
+def test_exe_call_without_run_module(config, load_test_bin, bin_file):
+    """call() on an EXE entrypoint creates a module-backed process, not a container."""
+    data = load_test_bin(bin_file)
+    se = Speakeasy(config=config)
+    try:
+        mod = se.load_module(data=data)
+        ep = mod.base + mod.ep
+        se.call(ep, [0, 0, 0, 0])  # four zero args, the same count that module_entry runs are given
+
+        emu = se.emu
+        assert emu.curr_process is not None
+        assert emu.curr_process.pe is mod, "EXE call() should produce a module-backed process"  # identity with the loaded module
+        assert emu.curr_process.base == mod.base
+    finally:
+        se.shutdown()  # runs even if an assertion failed
+
+
+# --- Process bookkeeping ---
+
+
+@pytest.mark.parametrize("bin_file", DLL_BINS)
+def test_process_in_processes_after_run_module(config, load_test_bin, bin_file):
+    """After run_module(), curr_process must be discoverable in self.processes."""
+    data = load_test_bin(bin_file)
+    se = Speakeasy(config=config)
+    try:
+        mod = se.load_module(data=data)
+        se.run_module(mod)  # DLL path of run_module()
+
+        emu = se.emu
+        assert emu.curr_process is not None
+        assert emu.curr_process in emu.processes, "curr_process not in self.processes"  # bookkeeping check
+    finally:
+        se.shutdown()  # runs even if an assertion failed
+
+
+@pytest.mark.parametrize("bin_file", EXE_BINS)
+def test_process_in_processes_after_exe_run_module(config, load_test_bin, bin_file):
+    """After EXE run_module(), the active process is in self.processes."""
+    data = load_test_bin(bin_file)
+    se = Speakeasy(config=config)
+    try:
+        mod = se.load_module(data=data)
+        se.run_module(mod)  # EXE path of run_module()
+
+        emu = se.emu
+        assert emu.curr_process is not None
+        assert emu.curr_process in emu.processes, "EXE curr_process not in self.processes"  # bookkeeping check
+    finally:
+        se.shutdown()  # runs even if an assertion failed
+
+
+@pytest.mark.parametrize("bin_file", DLL_BINS)
+def test_process_in_processes_after_call(config, load_test_bin, bin_file):
+    """After call() without run_module(), curr_process is in self.processes."""
+    data = load_test_bin(bin_file)
+    se = Speakeasy(config=config)
+    try:
+        mod = se.load_module(data=data)
+        se.call(mod.base + mod.ep, [mod.base, 1, 0])  # no run_module() beforehand
+
+        emu = se.emu
+        assert emu.curr_process is not None
+        assert emu.curr_process in emu.processes  # the process used by call() must be registered
+    finally:
+        se.shutdown()  # runs even if an assertion failed
+
+
+# --- Queued call does not mutate SP ---
+
+
+@pytest.mark.parametrize("bin_file", DLL_BINS)
+def test_queued_call_does_not_mutate_sp(config, load_test_bin, bin_file):
+    """Invoking call() while the queue is non-empty must not change SP/BP."""
+    data = load_test_bin(bin_file)
+    se = Speakeasy(config=config)
+    try:
+        mod = se.load_module(data=data)
+        ep = mod.base + mod.ep
+
+        placeholder = Run()  # hand-built run that is added but never started
+        placeholder.type = "placeholder"
+        placeholder.start_addr = ep
+        placeholder.args = [mod.base, 1, 0]
+        se.emu.add_run(placeholder)  # presumably makes the run queue non-empty so call() only enqueues - confirm against call()
+
+        sp_before = se.emu.get_stack_ptr()
+        se.call(ep, [mod.base, 1, 0])
+        sp_after = se.emu.get_stack_ptr()
+
+        assert sp_before == sp_after, (  # only SP is compared; the BP mentioned in the docstring is not checked
+            f"call() mutated SP while queue non-empty: {sp_before:#x} -> {sp_after:#x}"
+        )
+    finally:
+        se.shutdown()  # runs even if the assertion failed
+
+
+# --- Shellcode ---
+
+
+@pytest.mark.parametrize("arch", ["x86", "x64"])
+def test_shellcode_context(config, arch):
+    """Shellcode runs get proper process, thread, and register initialization."""
+    sc_data = b"\xc3"  # ret
+    se = Speakeasy(config=config)
+    try:
+        sc_addr = se.load_shellcode("test_shellcode", arch, data=sc_data)
+        se.run_shellcode(sc_addr)
+
+        emu = se.emu
+        assert len(emu.runs) >= 1
+        sc_run = emu.runs[0]  # presumably the shellcode run, as the first entry - confirm
+        assert sc_run.process_context is not None, "shellcode run missing process_context"
+        assert sc_run.thread is not None, "shellcode run missing thread"
+        assert sc_run.thread.process is sc_run.process_context  # identity, not equality
+        assert sc_run.process_context in emu.processes  # NOTE(review): no register value is asserted despite the docstring
+    finally:
+        se.shutdown()  # runs even if an assertion failed
+
+
+# --- Fail-fast on conflicting thread/process ---
+
+
+def test_conflicting_thread_process_raises(config, load_test_bin):
+    """A run whose thread is bound to a different process should fail fast."""
+    data = load_test_bin("dll_test_x86.dll.xz")  # fixed x86 DLL; this test is not parametrized
+    se = Speakeasy(config=config)
+    try:
+        mod = se.load_module(data=data)
+
+        proc_a = objman.Process(se.emu)  # process the thread is bound to
+        proc_b = objman.Process(se.emu)  # different process given to the run
+        thread = objman.Thread(se.emu, stack_base=se.emu.stack_base)
+        thread.process = proc_a
+        proc_a.threads.append(thread)
+
+        run = Run()
+        run.type = "test_conflict"
+        run.start_addr = mod.base + mod.ep
+        run.args = [mod.base, 1, 0]
+        run.process_context = proc_b  # deliberately disagrees with thread.process
+        run.thread = thread
+
+        se.emu.add_run(run)
+
+        with pytest.raises(WindowsEmuError, match="different process"):  # pattern matches the mismatch error text
+            se.emu.start()
+    finally:
+        se.shutdown()  # runs even if the expectation failed