diff --git a/centipede/BUILD b/centipede/BUILD
index cbc0df63a..faa493b46 100644
--- a/centipede/BUILD
+++ b/centipede/BUILD
@@ -115,6 +115,15 @@ cc_library(
     hdrs = ["int_utils.h"],
 )
 
+cc_library(
+    name = "flag_util",
+    srcs = ["flag_util.cc"],
+    hdrs = ["flag_util.h"],
+    deps = [
+        "@abseil-cpp//absl/base:nullability",
+    ],
+)
+
 cc_library(
     name = "rolling_hash",
     hdrs = ["rolling_hash.h"],
@@ -1024,6 +1033,8 @@ cc_library(
 # e.g. feature.cc. These files are compiled by the engine and the runner
 # separately, with different compiler flags.
 RUNNER_SOURCES_NO_MAIN = [
+    "coverage_state.cc",
+    "coverage_state.h",
     "byte_array_mutator.cc",
     "byte_array_mutator.h",
     "callstack.h",
@@ -1210,6 +1221,35 @@ cc_library(
     deps = ["@abseil-cpp//absl/flags:flag"],
 )
 
+cc_library(
+    name = "coverage_state",
+    srcs = [
+        "coverage_state.cc",
+        "runner_dl_info.cc",
+        "runner_dl_info.h",
+        "runner_interceptors.cc",
+        "runner_sancov.cc",
+        "runner_sancov_object.cc",
+        "runner_sancov_object.h",
+        "runner_utils.cc",
+        "runner_utils.h",
+    ],
+    hdrs = ["coverage_state.h"],
+    deps = [
+        ":callstack",
+        ":feature",
+        ":foreach_nonzero",
+        ":int_utils",
+        ":pc_info",
+        ":reverse_pc_table",
+        ":runner_cmp_trace",
+        ":runner_result",
+        "@abseil-cpp//absl/base:core_headers",
+        "@abseil-cpp//absl/base:nullability",
+        "@abseil-cpp//absl/numeric:bits",
+    ],
+)
+
 ################################################################################
 # General-purpose testing utilities
 ################################################################################
diff --git a/centipede/control_flow.cc b/centipede/control_flow.cc
index 68c443ee0..7a7deecf9 100644
--- a/centipede/control_flow.cc
+++ b/centipede/control_flow.cc
@@ -45,9 +45,12 @@ namespace fuzztest::internal {
 
 PCTable ReadPcTableFromFile(std::string_view file_path) {
+  LOG(INFO) << "ReadPcTableFromFile: " << file_path << "\n";
   ByteArray pc_infos_as_bytes;
   ReadFromLocalFile(file_path, pc_infos_as_bytes);
   CHECK_EQ(pc_infos_as_bytes.size() % sizeof(PCInfo), 0);
+  LOG(INFO) << "size of pc_infos_as_bytes: " << pc_infos_as_bytes.size()
+            << "\n";
   size_t pc_table_size = pc_infos_as_bytes.size() / sizeof(PCInfo);
   const auto *pc_infos =
       reinterpret_cast<const PCInfo *>(pc_infos_as_bytes.data());
   PCTable pc_table{pc_infos, pc_infos + pc_table_size};
diff --git a/centipede/coverage_state.cc b/centipede/coverage_state.cc
new file mode 100644
index 000000000..b0f19ab84
--- /dev/null
+++ b/centipede/coverage_state.cc
@@ -0,0 +1,355 @@
+// Copyright 2022 The Centipede Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "./centipede/coverage_state.h"
+
+#include <atomic>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "absl/base/nullability.h"
+#include "./centipede/feature.h"
+#include "./centipede/int_utils.h"
+#include "./centipede/runner_result.h"
+#include "./centipede/runner_utils.h"
+
+namespace fuzztest::internal {
+namespace {
+
+// Returns the length of the common prefix of `s1` and `s2`, but not more
+// than 63. I.e. the returned value is in [0, 64).
+size_t LengthOfCommonPrefix(const void *s1, const void *s2, size_t n) {
+  const auto *p1 = static_cast<const uint8_t *>(s1);
+  const auto *p2 = static_cast<const uint8_t *>(s2);
+  static constexpr size_t kMaxLen = 63;
+  if (n > kMaxLen) n = kMaxLen;
+  for (size_t i = 0; i < n; ++i) {
+    if (p1[i] != p2[i]) return i;
+  }
+  return n;
+}
+
+class ThreadTerminationDetector {
+ public:
+  // A dummy method to trigger the construction and make sure that the
+  // destructor will be called on the thread termination.
+  __attribute__((optnone)) void EnsureAlive() {}
+
+  ~ThreadTerminationDetector() { tls.OnThreadStop(); }
+};
+
+thread_local ThreadTerminationDetector termination_detector;
+
+}  // namespace
+
+CoverageState coverage_state __attribute__((init_priority(200)));
+
+// We use __thread instead of thread_local so that the compiler warns if
+// the initializer for `tls` is not a constant expression.
+// `tls` thus must not have a CTOR.
+// This avoids calls to __tls_init() in hot functions that use `tls`.
+__thread ThreadLocalRunnerState tls;
+
+extern "C" __attribute__((weak)) const char *absl_nullable
+CentipedeGetRunnerFlags() {
+  if (const char *runner_flags_env = getenv("CENTIPEDE_RUNNER_FLAGS"))
+    return strdup(runner_flags_env);
+  return nullptr;
+}
+
+__attribute__((noinline)) void CheckStackLimit(uintptr_t sp) {
+  static std::atomic_flag stack_limit_exceeded = ATOMIC_FLAG_INIT;
+  const size_t stack_limit =
+      coverage_state.run_time_flags.stack_limit_kb.load() << 10;
+  // Check for the stack limit only if sp is inside the stack region.
+  if (stack_limit > 0 && tls.stack_region_low &&
+      tls.top_frame_sp - sp > stack_limit) {
+    if (!coverage_state.test_started) return;
+    if (stack_limit_exceeded.test_and_set()) return;
+    fprintf(stderr,
+            "========= Stack limit exceeded: %" PRIuPTR
+            " > %zu"
+            " (byte); aborting\n",
+            tls.top_frame_sp - sp, stack_limit);
+    CentipedeSetFailureDescription(
+        fuzztest::internal::kExecutionFailureStackLimitExceeded.data());
+    std::abort();
+  }
+}
+
+void CoverageState::CleanUpDetachedTls() {
+  LockGuard lock(tls_list_mu);
+  while (detached_tls_list) {
+    ThreadLocalRunnerState *tls = detached_tls_list;
+    detached_tls_list = detached_tls_list->next;
+    delete tls;
+  }
+}
+
+__attribute__((noinline))  // so that we see it in profile.
+void PrepareCoverage(bool full_clear) {
+  coverage_state.CleanUpDetachedTls();
+  if (coverage_state.run_time_flags.path_level != 0) {
+    coverage_state.ForEachTls([](ThreadLocalRunnerState &tls) {
+      tls.path_ring_buffer.Reset(coverage_state.run_time_flags.path_level);
+      tls.call_stack.Reset(coverage_state.run_time_flags.callstack_level);
+      tls.lowest_sp = tls.top_frame_sp;
+    });
+  }
+  {
+    fuzztest::internal::LockGuard lock(
+        coverage_state.execution_result_override_mu);
+    if (coverage_state.execution_result_override != nullptr) {
+      coverage_state.execution_result_override->ClearAndResize(0);
+    }
+  }
+  if (!full_clear) return;
+  coverage_state.ForEachTls([](ThreadLocalRunnerState &tls) {
+    if (coverage_state.run_time_flags.use_auto_dictionary) {
+      tls.cmp_trace2.Clear();
+      tls.cmp_trace4.Clear();
+      tls.cmp_trace8.Clear();
+      tls.cmp_traceN.Clear();
+    }
+  });
+  coverage_state.pc_counter_set.ForEachNonZeroByte(
+      [](size_t idx, uint8_t value) {}, 0,
+      coverage_state.actual_pc_counter_set_size_aligned);
+  if (coverage_state.run_time_flags.use_dataflow_features)
+    coverage_state.data_flow_feature_set.ForEachNonZeroBit([](size_t idx) {});
+  if (coverage_state.run_time_flags.use_cmp_features) {
+    coverage_state.cmp_feature_set.ForEachNonZeroBit([](size_t idx) {});
+    coverage_state.cmp_eq_set.ForEachNonZeroBit([](size_t idx) {});
+    coverage_state.cmp_moddiff_set.ForEachNonZeroBit([](size_t idx) {});
+    coverage_state.cmp_hamming_set.ForEachNonZeroBit([](size_t idx) {});
+    coverage_state.cmp_difflog_set.ForEachNonZeroBit([](size_t idx) {});
+  }
+  if (coverage_state.run_time_flags.path_level != 0)
+    coverage_state.path_feature_set.ForEachNonZeroBit([](size_t idx) {});
+  if (coverage_state.run_time_flags.callstack_level != 0)
+    coverage_state.callstack_set.ForEachNonZeroBit([](size_t idx) {});
+  for (auto *p = coverage_state.user_defined_begin;
+       p != coverage_state.user_defined_end; ++p) {
+    *p = 0;
+  }
+  coverage_state.sancov_objects.ClearInlineCounters();
+  coverage_state.test_started = true;
+}
+
+static void MaybeAddFeature(feature_t feature) {
+  if (!coverage_state.run_time_flags.skip_seen_features) {
+    coverage_state.g_features.push_back(feature);
+  } else if (!coverage_state.seen_features.get(feature)) {
+    coverage_state.g_features.push_back(feature);
+    coverage_state.seen_features.set(feature);
+  }
+}
+
+// Adds a kPCs and/or k8bitCounters feature to `g_features` based on arguments.
+// `idx` is a pc_index.
+// `counter_value` (non-zero) is a counter value associated with that PC.
+static void AddPcIndxedAndCounterToFeatures(size_t idx, uint8_t counter_value) {
+  if (coverage_state.run_time_flags.use_pc_features) {
+    MaybeAddFeature(feature_domains::kPCs.ConvertToMe(idx));
+  }
+  if (coverage_state.run_time_flags.use_counter_features) {
+    MaybeAddFeature(feature_domains::k8bitCounters.ConvertToMe(
+        Convert8bitCounterToNumber(idx, counter_value)));
+  }
+}
+
+__attribute__((noinline))  // so that we see it in profile.
+void PostProcessCoverage(int target_return_value) {
+  coverage_state.g_features.clear();
+
+  if (target_return_value == -1) return;
+
+  // Convert counters to features.
+  coverage_state.pc_counter_set.ForEachNonZeroByte(
+      [](size_t idx, uint8_t value) {
+        AddPcIndxedAndCounterToFeatures(idx, value);
+      },
+      0, coverage_state.actual_pc_counter_set_size_aligned);
+
+  // Convert data flow bit set to features.
+  if (coverage_state.run_time_flags.use_dataflow_features) {
+    coverage_state.data_flow_feature_set.ForEachNonZeroBit([](size_t idx) {
+      MaybeAddFeature(feature_domains::kDataFlow.ConvertToMe(idx));
+    });
+  }
+
+  // Convert cmp bit set to features.
+  if (coverage_state.run_time_flags.use_cmp_features) {
+    // TODO(kcc): remove cmp_feature_set.
+    coverage_state.cmp_feature_set.ForEachNonZeroBit([](size_t idx) {
+      MaybeAddFeature(feature_domains::kCMP.ConvertToMe(idx));
+    });
+    coverage_state.cmp_eq_set.ForEachNonZeroBit([](size_t idx) {
+      MaybeAddFeature(feature_domains::kCMPEq.ConvertToMe(idx));
+    });
+    coverage_state.cmp_moddiff_set.ForEachNonZeroBit([](size_t idx) {
+      MaybeAddFeature(feature_domains::kCMPModDiff.ConvertToMe(idx));
+    });
+    coverage_state.cmp_hamming_set.ForEachNonZeroBit([](size_t idx) {
+      MaybeAddFeature(feature_domains::kCMPHamming.ConvertToMe(idx));
+    });
+    coverage_state.cmp_difflog_set.ForEachNonZeroBit([](size_t idx) {
+      MaybeAddFeature(feature_domains::kCMPDiffLog.ConvertToMe(idx));
+    });
+  }
+
+  // Convert path bit set to features.
+  if (coverage_state.run_time_flags.path_level != 0) {
+    coverage_state.path_feature_set.ForEachNonZeroBit([](size_t idx) {
+      MaybeAddFeature(feature_domains::kBoundedPath.ConvertToMe(idx));
+    });
+  }
+
+  // Iterate all threads and get features from TLS data.
+  coverage_state.ForEachTls([](ThreadLocalRunnerState &tls) {
+    if (coverage_state.run_time_flags.callstack_level != 0) {
+      RunnerCheck(tls.top_frame_sp >= tls.lowest_sp,
+                  "bad values of tls.top_frame_sp and tls.lowest_sp");
+      size_t sp_diff = tls.top_frame_sp - tls.lowest_sp;
+      MaybeAddFeature(feature_domains::kCallStack.ConvertToMe(sp_diff));
+    }
+  });
+
+  if (coverage_state.run_time_flags.callstack_level != 0) {
+    coverage_state.callstack_set.ForEachNonZeroBit([](size_t idx) {
+      MaybeAddFeature(feature_domains::kCallStack.ConvertToMe(idx));
+    });
+  }
+
+  // Copy the features from __centipede_extra_features to g_features.
+  // Zero features are ignored - we treat them as default (unset) values.
+  for (auto *p = coverage_state.user_defined_begin;
+       p != coverage_state.user_defined_end; ++p) {
+    if (auto user_feature = *p) {
+      // User domain ID is the upper 32 bits.
+      feature_t user_domain_id = user_feature >> 32;
+      // User feature ID is the lower 32 bits.
+      feature_t user_feature_id = user_feature & ((1ULL << 32) - 1);
+      // There is no hard guarantee how many user domains are actually
+      // available. If a user domain ID is out of range, alias it to an existing
+      // domain. This is kinder than silently dropping the feature.
+      user_domain_id %= std::size(feature_domains::kUserDomains);
+      MaybeAddFeature(feature_domains::kUserDomains[user_domain_id].ConvertToMe(
+          user_feature_id));
+      *p = 0;  // cleanup for the next iteration.
+    }
+  }
+
+  // Iterates all non-zero inline 8-bit counters, if they are present.
+  // Calls AddPcIndxedAndCounterToFeatures on non-zero counters and zeroes them.
+  if (coverage_state.run_time_flags.use_pc_features ||
+      coverage_state.run_time_flags.use_counter_features) {
+    coverage_state.sancov_objects.ForEachNonZeroInlineCounter(
+        [](size_t idx, uint8_t counter_value) {
+          AddPcIndxedAndCounterToFeatures(idx, counter_value);
+        });
+  }
+}
+
+void ThreadLocalRunnerState::TraceMemCmp(uintptr_t caller_pc, const uint8_t *s1,
+                                         const uint8_t *s2, size_t n,
+                                         bool is_equal) {
+  if (coverage_state.run_time_flags.use_cmp_features) {
+    const uintptr_t pc_offset =
+        caller_pc - coverage_state.main_object.start_address;
+    const uintptr_t hash =
+        fuzztest::internal::Hash64Bits(pc_offset) ^ tls.path_ring_buffer.hash();
+    const size_t lcp = LengthOfCommonPrefix(s1, s2, n);
+    // lcp is a 6-bit number.
+    coverage_state.cmp_feature_set.set((hash << 6) | lcp);
+  }
+  if (!is_equal && coverage_state.run_time_flags.use_auto_dictionary) {
+    cmp_traceN.Capture(n, s1, s2);
+  }
+}
+
+void ThreadLocalRunnerState::OnThreadStart() {
+  termination_detector.EnsureAlive();
+  tls.started = true;
+  tls.lowest_sp = tls.top_frame_sp =
+      reinterpret_cast<uintptr_t>(__builtin_frame_address(0));
+  tls.stack_region_low = GetCurrentThreadStackRegionLow();
+  if (tls.stack_region_low == 0) {
+    fprintf(stderr,
+            "Disabling stack limit check due to missing stack region info.\n");
+  }
+  tls.call_stack.Reset(coverage_state.run_time_flags.callstack_level);
+  tls.path_ring_buffer.Reset(coverage_state.run_time_flags.path_level);
+  LockGuard lock(coverage_state.tls_list_mu);
+  // Add myself to state.tls_list.
+  auto *old_list = coverage_state.tls_list;
+  tls.next = old_list;
+  coverage_state.tls_list = &tls;
+  if (old_list != nullptr) old_list->prev = &tls;
+}
+
+void ThreadLocalRunnerState::OnThreadStop() {
+  LockGuard lock(coverage_state.tls_list_mu);
+  // Remove myself from state.tls_list. The list never
+  // becomes empty because the main thread does not call OnThreadStop().
+  if (&tls == coverage_state.tls_list) {
+    coverage_state.tls_list = tls.next;
+    tls.prev = nullptr;
+  } else {
+    auto *prev_tls = tls.prev;
+    auto *next_tls = tls.next;
+    prev_tls->next = next_tls;
+    if (next_tls != nullptr) next_tls->prev = prev_tls;
+  }
+  tls.next = tls.prev = nullptr;
+  if (tls.ignore) return;
+  // Create a detached copy on heap and add it to detached_tls_list to
+  // collect its coverage later.
+  //
+  // TODO(xinhaoyuan): Consider refactoring the list operations into class
+  // methods instead of duplicating them.
+  ThreadLocalRunnerState *detached_tls = new ThreadLocalRunnerState(tls);
+  auto *old_list = coverage_state.detached_tls_list;
+  detached_tls->next = old_list;
+  coverage_state.detached_tls_list = detached_tls;
+  if (old_list != nullptr) old_list->prev = detached_tls;
+}
+
+extern "C" void CentipedeSetFailureDescription(const char *description) {
+  using fuzztest::internal::coverage_state;
+  if (coverage_state.failure_description_path == nullptr) return;
+  // Make sure that the write is atomic and only happens once.
+  [[maybe_unused]] static int write_once = [=] {
+    FILE *f = fopen(coverage_state.failure_description_path, "w");
+    if (f == nullptr) {
+      perror("FAILURE: fopen()");
+      return 0;
+    }
+    const auto len = strlen(description);
+    if (fwrite(description, 1, len, f) != len) {
+      perror("FAILURE: fwrite()");
+    }
+    if (fflush(f) != 0) {
+      perror("FAILURE: fflush()");
+    }
+    if (fclose(f) != 0) {
+      perror("FAILURE: fclose()");
+    }
+    return 0;
+  }();
+}
+
+}  // namespace fuzztest::internal
diff --git a/centipede/coverage_state.h b/centipede/coverage_state.h
new file mode 100644
index 000000000..777bae467
--- /dev/null
+++ b/centipede/coverage_state.h
@@ -0,0 +1,319 @@
+// Copyright 2022 The Centipede Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FUZZTEST_CENTIPEDE_COVERAGE_STATE_H_
+#define FUZZTEST_CENTIPEDE_COVERAGE_STATE_H_
+
+#include <pthread.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "absl/base/const_init.h"
+#include "absl/base/nullability.h"
+#include "absl/numeric/bits.h"
+#include "./centipede/callstack.h"
+#include "./centipede/concurrent_bitset.h"
+#include "./centipede/concurrent_byteset.h"
+#include "./centipede/feature.h"
+#include "./centipede/hashed_ring_buffer.h"
+#include "./centipede/reverse_pc_table.h"
+#include "./centipede/runner_cmp_trace.h"
+#include "./centipede/runner_dl_info.h"
+#include "./centipede/runner_result.h"
+#include "./centipede/runner_sancov_object.h"
+
+namespace fuzztest::internal {
+
+// Like std::lock_guard, but for pthread_mutex_t.
+class LockGuard {
+ public:
+  explicit LockGuard(pthread_mutex_t &mu) : mu_(mu) { pthread_mutex_lock(&mu); }
+  ~LockGuard() { pthread_mutex_unlock(&mu_); }
+
+ private:
+  pthread_mutex_t &mu_;
+};
+
+// One such object is created in runner's TLS.
+// There is no CTOR, since we don't want to use the brittle and lazy TLS CTORs.
+// All data members are zero-initialized during thread creation.
+struct ThreadLocalRunnerState {
+  // Traces the memory comparison of `n` bytes at `s1` and `s2` called at
+  // `caller_pc` with `is_equal` indicating whether the two memory regions have
+  // equal contents. May add cmp features and auto-dictionary entries if
+  // enabled.
+  void TraceMemCmp(uintptr_t caller_pc, const uint8_t *s1, const uint8_t *s2,
+                   size_t n, bool is_equal);
+
+  // Intrusive doubly-linked list of TLS objects.
+  // Guarded by state.tls_list_mu.
+  ThreadLocalRunnerState *next, *prev;
+
+  // The pthread_create() interceptor calls OnThreadStart() before the thread
+  // callback. The main thread also calls OnThreadStart(). OnThreadStop() will
+  // be called when thread termination is detected internally - see runner.cc.
+  void OnThreadStart();
+  void OnThreadStop();
+
+  // Whether OnThreadStart() is called on this thread. This is used as a proxy
+  // of the readiness of the lower-level runtime.
+  bool started;
+
+  // Paths are thread-local, so we maintain the current bounded path here.
+  // We allow paths of up to 100, controlled at run-time via the "path_level".
+  static constexpr uint64_t kBoundedPathLength = 100;
+  HashedRingBuffer<kBoundedPathLength> path_ring_buffer;
+
+  // Value of SP in the top call frame of the thread, computed in OnThreadStart.
+  uintptr_t top_frame_sp;
+  // The lower bound of the stack region of this thread. 0 means unknown.
+  uintptr_t stack_region_low;
+  // Lowest observed value of SP.
+  uintptr_t lowest_sp;
+
+  // The (imprecise) call stack is updated by the PC callback.
+  CallStack<> call_stack;
+
+  // Cmp traces capture the arguments of CMP instructions, memcmp, etc.
+  // We have dedicated traces for 2-, 4-, and 8-byte comparisons, and
+  // a catch-all `cmp_traceN` trace for memcmp, etc.
+  CmpTrace<2, 64> cmp_trace2;
+  CmpTrace<4, 64> cmp_trace4;
+  CmpTrace<8, 64> cmp_trace8;
+  CmpTrace<0, 64> cmp_traceN;
+
+  // Set this to true if the thread needs to be ignored in ForEachTls.
+  // It should be always false if the state is in the global detached_tls_list.
+  bool ignore;
+};
+
+// Flags derived from CENTIPEDE_RUNNER_FLAGS.
+// Flags used in instrumentation callbacks are bit-packed for efficiency.
+struct CoverageFlags {
+  uint64_t path_level : 8;
+  uint64_t use_pc_features : 1;
+  uint64_t use_cmp_features : 1;
+  uint64_t callstack_level : 8;
+  uint64_t use_counter_features : 1;
+  uint64_t use_auto_dictionary : 1;
+  uint64_t skip_seen_features : 1;
+  uint64_t use_dataflow_features : 1;
+  std::atomic<uint64_t> stack_limit_kb;
+};
+
+extern "C" __attribute__((weak)) const char *absl_nullable
+CentipedeGetRunnerFlags();
+
+// Set the failure description for the runner to propagate further. Only the
+// description from the first call will be used.
+extern "C" void CentipedeSetFailureDescription(const char *description);
+
+struct CoverageState {
+  const char *centipede_runner_flags = CentipedeGetRunnerFlags();
+
+  // The path to a file where the runner may write the description of failure.
+  const char *failure_description_path =
+      GetStringFlag(":failure_description_path=");
+
+  // Flags.
+  CoverageFlags run_time_flags = {
+      /*path_level=*/std::min(ThreadLocalRunnerState::kBoundedPathLength,
+                              HasIntFlag(":path_level=", 0)),
+      /*use_pc_features=*/HasFlag(":use_pc_features:"),
+      /*use_cmp_features=*/HasFlag(":use_cmp_features:"),
+      /*callstack_level=*/HasIntFlag(":callstack_level=", 0),
+      /*use_counter_features=*/HasFlag(":use_counter_features:"),
+      /*use_auto_dictionary=*/HasFlag(":use_auto_dictionary:"),
+      /*skip_seen_features=*/HasFlag(":skip_seen_features:"),
+      /*use_dataflow_features=*/HasFlag(":use_dataflow_features:"),
+      /*stack_limit_kb=*/HasIntFlag(":stack_limit_kb=", 0),
+  };
+
+  // Returns true iff `flag` is present.
+  // Typical usage: pass ":some_flag:", i.e. the flag name surrounded with ':'.
+  // TODO(ussuri): Refactor `char *` into a `string_view`.
+  bool HasFlag(const char *absl_nonnull flag) const {
+    if (!centipede_runner_flags) return false;
+    return strstr(centipede_runner_flags, flag) != nullptr;
+  }
+
+  // If a flag=value pair is present, returns value,
+  // otherwise returns `default_value`.
+  // Typical usage: pass ":some_flag=".
+  // TODO(ussuri): Refactor `char *` into a `string_view`.
+  uint64_t HasIntFlag(const char *absl_nonnull flag,
+                      uint64_t default_value) const {
+    if (!centipede_runner_flags) return default_value;
+    const char *beg = strstr(centipede_runner_flags, flag);
+    if (!beg) return default_value;
+    return atoll(beg + strlen(flag));  // NOLINT: can't use strto64, etc.
+  }
+
+  // If a :flag=value: pair is present, returns value, otherwise returns nullptr.
+  // The result is obtained by calling strndup, so make sure to save
+  // it in `this` to avoid a leak.
+  // Typical usage: pass ":some_flag=".
+  // TODO(ussuri): Refactor `char *` into a `string_view`.
+  const char *absl_nullable GetStringFlag(const char *absl_nonnull flag) const {
+    if (!centipede_runner_flags) return nullptr;
+    // Extract "value" from ":flag=value:" inside centipede_runner_flags.
+    const char *beg = strstr(centipede_runner_flags, flag);
+    if (!beg) return nullptr;
+    const char *value_beg = beg + strlen(flag);
+    const char *end = strstr(value_beg, ":");
+    if (!end) return nullptr;
+    return strndup(value_beg, end - value_beg);
+  }
+
+  bool test_started = false;
+
+  // State for SanitizerCoverage.
+  // See https://clang.llvm.org/docs/SanitizerCoverage.html.
+  SanCovObjectArray sancov_objects;
+
+  // Computed by DlInfo().
+  // Usually, the main object is the executable binary containing main()
+  // and most of the executable code (we assume that the target is
+  // built in mostly-static mode, i.e. -dynamic_mode=off).
+  // When the `dl_path_suffix` runner flag is provided, the main_object refers
+  // to the dynamic library (DSO) pointed to by this flag.
+  //
+  // Note: this runner currently does not support more than one instrumented
+  // DSO in the process, i.e. you either instrument the main binary, or one DSO.
+  // Supporting more than one DSO will require major changes,
+  // major added complexity, and potentially cause slowdown.
+  // There is currently no motivation for such a change.
+  DlInfo main_object;
+
+  // Used by trace_pc instrumentation. Populated if `pcs_file_path` flag is set.
+  ReversePCTable reverse_pc_table;
+
+  // kMaxNumPcs is the maximum number of instrumented PCs in the binary.
+  // We can be generous here since the unused memory will not cost anything.
+  // `pc_counter_set` is a static byte set supporting up to kMaxNumPcs PCs.
+  static constexpr size_t kMaxNumPcs = 1 << 28;
+  TwoLayerConcurrentByteSet<kMaxNumPcs> pc_counter_set{absl::kConstInit};
+
+  // This is the actual number of PCs, aligned up to
+  // pc_counter_set::kSizeMultiple, computed at startup.
+  size_t actual_pc_counter_set_size_aligned;
+
+  // An arbitrarily large size.
+  static constexpr size_t kDataFlowFeatureSetSize = 1 << 18;
+  ConcurrentBitSet<kDataFlowFeatureSetSize> data_flow_feature_set{
+      absl::kConstInit};
+
+  // Tracing CMP instructions, capture events from these domains:
+  // kCMPEq, kCMPModDiff, kCMPHamming, kCMPModDiffLog, kCMPMsbEq.
+  // See https://clang.llvm.org/docs/SanitizerCoverage.html#tracing-data-flow.
+  // An arbitrarily large size.
+  static constexpr size_t kCmpFeatureSetSize = 1 << 18;
+  // TODO(kcc): remove cmp_feature_set.
+  ConcurrentBitSet<kCmpFeatureSetSize> cmp_feature_set{absl::kConstInit};
+  ConcurrentBitSet<kCmpFeatureSetSize> cmp_eq_set{absl::kConstInit};
+  ConcurrentBitSet<kCmpFeatureSetSize> cmp_moddiff_set{absl::kConstInit};
+  ConcurrentBitSet<kCmpFeatureSetSize> cmp_hamming_set{absl::kConstInit};
+  ConcurrentBitSet<kCmpFeatureSetSize> cmp_difflog_set{absl::kConstInit};
+
+  // We think that call stack produces rich signal, so we give a few bits to it.
+  static constexpr size_t kCallStackFeatureSetSize = 1 << 24;
+  ConcurrentBitSet<kCallStackFeatureSetSize> callstack_set{absl::kConstInit};
+
+  // Initialized in CTOR from the __centipede_extra_features section.
+  feature_t *user_defined_begin;
+  feature_t *user_defined_end;
+
+  pthread_mutex_t execution_result_override_mu = PTHREAD_MUTEX_INITIALIZER;
+  // If not nullptr, it points to a batch result with either zero or one
+  // execution. When an execution result is present, it will be passed as the
+  // execution result of the current test input. The object is owned and
+  // cleaned up by the state, protected by execution_result_override_mu, and
+  // set by `CentipedeSetExecutionResult()`.
+  BatchResult *execution_result_override;
+
+  // An arbitrarily large size.
+  static constexpr size_t kPathBitSetSize = 1 << 25;
+  // Observed paths. The total number of observed paths for --path_level=N
+  // can be up to NumPCs**N.
+  // So, we make the bitset very large, but it may still saturate.
+  ConcurrentBitSet<kPathBitSetSize> path_feature_set{absl::kConstInit};
+
+  // Doubly linked list of TLSs of all live threads.
+  ThreadLocalRunnerState *tls_list;
+  // Doubly linked list of detached TLSs.
+  ThreadLocalRunnerState *detached_tls_list;
+  // Guards `tls_list` and `detached_tls_list`.
+  pthread_mutex_t tls_list_mu = PTHREAD_MUTEX_INITIALIZER;
+  // Iterates all TLS objects under tls_list_mu, except those with `ignore` set.
+  // Calls `callback()` on every TLS.
+  template <typename Callback>
+  void ForEachTls(Callback callback) {
+    LockGuard lock(tls_list_mu);
+    for (auto *it = tls_list; it; it = it->next) {
+      if (!it->ignore) callback(*it);
+    }
+    for (auto *it = detached_tls_list; it; it = it->next) {
+      callback(*it);
+    }
+  }
+
+  // Reclaims all TLSs in detached_tls_list and cleans up the list.
+  void CleanUpDetachedTls();
+
+  // An arbitrarily large size.
+  static const size_t kMaxFeatures = 1 << 20;
+  // FeatureArray used to accumulate features from all sources.
+  FeatureArray<kMaxFeatures> g_features;
+
+  // Features that were seen before.
+  static constexpr size_t kSeenFeatureSetSize =
+      absl::bit_ceil(feature_domains::kLastDomain.end());
+  ConcurrentBitSet<kSeenFeatureSetSize> seen_features{absl::kConstInit};
+};
+
+// Check for stack limit for the stack pointer `sp` in the current thread.
+void CheckStackLimit(uintptr_t sp);
+
+// Clears all coverage data.
+// All bitsets, counter arrays and such need to be clear before every execution.
+// However, clearing them is expensive because they are sparse.
+// Instead, we rely on ForEachNonZeroByte() and
+// ConcurrentBitSet::ForEachNonZeroBit to clear the bits/bytes after they
+// finish iterating.
+// We still need to clear all the thread-local data updated during execution.
+// If `full_clear==true` clear all coverage anyway - useful to remove the
+// coverage accumulated during startup.
+__attribute__((noinline))  // so that we see it in profile.
+void PrepareCoverage(bool full_clear);
+
+// Post-processes all coverage data, puts it all into `g_features`.
+// `target_return_value` is the value returned by LLVMFuzzerTestOneInput.
+//
+// If `target_return_value == -1`, sets `g_features` to empty. This way,
+// the engine will reject any input that causes the target to return -1.
+// LibFuzzer supports this return value as of 2022-07:
+// https://llvm.org/docs/LibFuzzer.html#rejecting-unwanted-inputs
+__attribute__((noinline))  // so that we see it in profile.
+void PostProcessCoverage(int target_return_value);
+
+extern CoverageState coverage_state;
+extern __thread ThreadLocalRunnerState tls;
+
+}  // namespace fuzztest::internal
+
+#endif  // FUZZTEST_CENTIPEDE_COVERAGE_STATE_H_
diff --git a/centipede/flag_util.cc b/centipede/flag_util.cc
new file mode 100644
index 000000000..758a828f0
--- /dev/null
+++ b/centipede/flag_util.cc
@@ -0,0 +1,46 @@
+// Copyright 2022 The Centipede Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "./centipede/flag_util.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <cstdint>
+
+#include "absl/base/nullability.h"
+
+extern "C" __attribute__((weak)) const char *absl_nullable
+CentipedeGetRunnerFlags() {
+  if (const char *runner_flags_env = getenv("CENTIPEDE_RUNNER_FLAGS"))
+    return strdup(runner_flags_env);
+  return nullptr;
+}
+
+static const char *centipede_runner_flags = CentipedeGetRunnerFlags();
+
+bool HasFlag(const char *absl_nonnull flag) {
+  fprintf(stderr, "HasFlag %s\n", flag);
+  fprintf(stderr, "centipede_runner_flags %s\n", centipede_runner_flags);
+  if (!centipede_runner_flags) return false;
+  return strstr(centipede_runner_flags, flag) != nullptr;
+}
+
+uint64_t HasIntFlag(const char *absl_nonnull flag, uint64_t default_value) {
+  if (!centipede_runner_flags) return default_value;
+  const char *beg = strstr(centipede_runner_flags, flag);
+  if (!beg) return default_value;
+  return atoll(beg + strlen(flag));  // NOLINT: can't use strto64, etc.
+}
diff --git a/centipede/flag_util.h b/centipede/flag_util.h
new file mode 100644
index 000000000..e6b6a8936
--- /dev/null
+++ b/centipede/flag_util.h
@@ -0,0 +1,35 @@
+// Copyright 2022 The Centipede Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FUZZTEST_CENTIPEDE_FLAG_UTIL_H_
+#define FUZZTEST_CENTIPEDE_FLAG_UTIL_H_
+
+#include <cstdint>
+
+#include "absl/base/nullability.h"
+
+extern "C" const char *absl_nullable CentipedeGetRunnerFlags();
+
+// Returns true iff `flag` is present.
+// Typical usage: pass ":some_flag:", i.e. the flag name surrounded with ':'.
+// TODO(ussuri): Refactor `char *` into a `string_view`.
+bool HasFlag(const char *absl_nonnull flag);
+
+// If a flag=value pair is present, returns value,
+// otherwise returns `default_value`.
+// Typical usage: pass ":some_flag=".
+// TODO(ussuri): Refactor `char *` into a `string_view`.
+uint64_t HasIntFlag(const char *absl_nonnull flag, uint64_t default_value);
+
+#endif  // FUZZTEST_CENTIPEDE_FLAG_UTIL_H_
diff --git a/centipede/runner.cc b/centipede/runner.cc
index d998350b5..1f34709aa 100644
--- a/centipede/runner.cc
+++ b/centipede/runner.cc
@@ -46,6 +46,7 @@
 
 #include "absl/base/nullability.h"
 #include "./centipede/byte_array_mutator.h"
+#include "./centipede/coverage_state.h"
 #include "./centipede/execution_metadata.h"
 #include "./centipede/feature.h"
 #include "./centipede/int_utils.h"
@@ -65,103 +66,11 @@ __attribute__((weak)) extern fuzztest::internal::feature_t
     __stop___centipede_extra_features;
 
 namespace fuzztest::internal {
-namespace {
-
-// Returns the length of the common prefix of `s1` and `s2`, but not more
-// than 63. I.e. the returned value is in [0, 64).
-size_t LengthOfCommonPrefix(const void *s1, const void *s2, size_t n) {
-  const auto *p1 = static_cast<const uint8_t *>(s1);
-  const auto *p2 = static_cast<const uint8_t *>(s2);
-  static constexpr size_t kMaxLen = 63;
-  if (n > kMaxLen) n = kMaxLen;
-  for (size_t i = 0; i < n; ++i) {
-    if (p1[i] != p2[i]) return i;
-  }
-  return n;
-}
-
-class ThreadTerminationDetector {
- public:
-  // A dummy method to trigger the construction and make sure that the
-  // destructor will be called on the thread termination.
-  __attribute__((optnone)) void EnsureAlive() {}
-
-  ~ThreadTerminationDetector() { tls.OnThreadStop(); }
-};
-
-thread_local ThreadTerminationDetector termination_detector;
-}  // namespace
+using fuzztest::internal::coverage_state;
+using fuzztest::internal::tls;
 
 GlobalRunnerState state __attribute__((init_priority(200)));
 
-// We use __thread instead of thread_local so that the compiler warns if
-// the initializer for `tls` is not a constant expression.
-// `tls` thus must not have a CTOR.
-// This avoids calls to __tls_init() in hot functions that use `tls`.
-__thread ThreadLocalRunnerState tls;
-
-void ThreadLocalRunnerState::TraceMemCmp(uintptr_t caller_pc, const uint8_t *s1,
-                                         const uint8_t *s2, size_t n,
-                                         bool is_equal) {
-  if (state.run_time_flags.use_cmp_features) {
-    const uintptr_t pc_offset = caller_pc - state.main_object.start_address;
-    const uintptr_t hash =
-        fuzztest::internal::Hash64Bits(pc_offset) ^ tls.path_ring_buffer.hash();
-    const size_t lcp = LengthOfCommonPrefix(s1, s2, n);
-    // lcp is a 6-bit number.
-    state.cmp_feature_set.set((hash << 6) | lcp);
-  }
-  if (!is_equal && state.run_time_flags.use_auto_dictionary) {
-    cmp_traceN.Capture(n, s1, s2);
-  }
-}
-
-void ThreadLocalRunnerState::OnThreadStart() {
-  termination_detector.EnsureAlive();
-  tls.started = true;
-  tls.lowest_sp = tls.top_frame_sp =
-      reinterpret_cast<uintptr_t>(__builtin_frame_address(0));
-  tls.stack_region_low = GetCurrentThreadStackRegionLow();
-  if (tls.stack_region_low == 0) {
-    fprintf(stderr,
-            "Disabling stack limit check due to missing stack region info.\n");
-  }
-  tls.call_stack.Reset(state.run_time_flags.callstack_level);
-  tls.path_ring_buffer.Reset(state.run_time_flags.path_level);
-  LockGuard lock(state.tls_list_mu);
-  // Add myself to state.tls_list.
-  auto *old_list = state.tls_list;
-  tls.next = old_list;
-  state.tls_list = &tls;
-  if (old_list != nullptr) old_list->prev = &tls;
-}
-
-void ThreadLocalRunnerState::OnThreadStop() {
-  LockGuard lock(state.tls_list_mu);
-  // Remove myself from state.tls_list. The list never
-  // becomes empty because the main thread does not call OnThreadStop().
-  if (&tls == state.tls_list) {
-    state.tls_list = tls.next;
-    tls.prev = nullptr;
-  } else {
-    auto *prev_tls = tls.prev;
-    auto *next_tls = tls.next;
-    prev_tls->next = next_tls;
-    if (next_tls != nullptr) next_tls->prev = prev_tls;
-  }
-  tls.next = tls.prev = nullptr;
-  if (tls.ignore) return;
-  // Create a detached copy on heap and add it to detached_tls_list to
-  // collect its coverage later.
-  //
-  // TODO(xinhaoyuan): Consider refactoring the list operations into class
-  // methods instead of duplicating them.
-  ThreadLocalRunnerState *detached_tls = new ThreadLocalRunnerState(tls);
-  auto *old_list = state.detached_tls_list;
-  detached_tls->next = old_list;
-  state.detached_tls_list = detached_tls;
-  if (old_list != nullptr) old_list->prev = detached_tls;
-}
 
 static size_t GetPeakRSSMb() {
   struct rusage usage = {};
@@ -276,36 +185,6 @@ static void CheckWatchdogLimits() {
   }
 }
 
-__attribute__((noinline)) void CheckStackLimit(uintptr_t sp) {
-  static std::atomic_flag stack_limit_exceeded = ATOMIC_FLAG_INIT;
-  const size_t stack_limit = state.run_time_flags.stack_limit_kb.load() << 10;
-  // Check for the stack limit only if sp is inside the stack region.
-  if (stack_limit > 0 && tls.stack_region_low &&
-      tls.top_frame_sp - sp > stack_limit) {
-    const bool test_not_running = state.input_start_time == 0;
-    if (test_not_running) return;
-    if (stack_limit_exceeded.test_and_set()) return;
-    fprintf(stderr,
-            "========= Stack limit exceeded: %" PRIuPTR
-            " > %zu"
-            " (byte); aborting\n",
-            tls.top_frame_sp - sp, stack_limit);
-    CentipedeSetFailureDescription(
-        fuzztest::internal::kExecutionFailureStackLimitExceeded.data());
-    std::abort();
-  }
-}
-
-void GlobalRunnerState::CleanUpDetachedTls() {
-  LockGuard lock(tls_list_mu);
-  ThreadLocalRunnerState *it_next = nullptr;
-  for (auto *it = detached_tls_list; it; it = it_next) {
-    it_next = it->next;
-    delete it;
-  }
-  detached_tls_list = nullptr;
-}
-
 void GlobalRunnerState::StartWatchdogThread() {
   fprintf(stderr,
           "Starting watchdog thread: timeout_per_input: %" PRIu64
@@ -314,7 +193,7 @@ void GlobalRunnerState::StartWatchdogThread() {
           state.run_time_flags.timeout_per_input.load(),
           state.run_time_flags.timeout_per_batch,
          state.run_time_flags.rss_limit_mb.load(),
-          state.run_time_flags.stack_limit_kb.load());
+          coverage_state.run_time_flags.stack_limit_kb.load());
   pthread_t watchdog_thread;
   pthread_create(&watchdog_thread, nullptr, WatchdogThread, nullptr);
   pthread_detach(watchdog_thread);
@@ -376,185 +255,6 @@ static void WriteFeaturesToFile(FILE *file, const feature_t *features,
              "wrong number of bytes written for coverage");
 }
 
-// Clears all coverage data.
-// All bitsets, counter arrays and such need to be clear before every execution.
-// However, clearing them is expensive because they are sparse.
-// Instead, we rely on ForEachNonZeroByte() and
-// ConcurrentBitSet::ForEachNonZeroBit to clear the bits/bytes after they
-// finish iterating.
-// We still need to clear all the thread-local data updated during execution.
-// If `full_clear==true` clear all coverage anyway - useful to remove the
-// coverage accumulated during startup.
-__attribute__((noinline))  // so that we see it in profile.
-static void
-PrepareCoverage(bool full_clear) {
-  state.CleanUpDetachedTls();
-  if (state.run_time_flags.path_level != 0) {
-    state.ForEachTls([](ThreadLocalRunnerState &tls) {
-      tls.path_ring_buffer.Reset(state.run_time_flags.path_level);
-      tls.call_stack.Reset(state.run_time_flags.callstack_level);
-      tls.lowest_sp = tls.top_frame_sp;
-    });
-  }
-  {
-    fuzztest::internal::LockGuard lock(state.execution_result_override_mu);
-    if (state.execution_result_override != nullptr) {
-      state.execution_result_override->ClearAndResize(0);
-    }
-  }
-  if (!full_clear) return;
-  state.ForEachTls([](ThreadLocalRunnerState &tls) {
-    if (state.run_time_flags.use_auto_dictionary) {
-      tls.cmp_trace2.Clear();
-      tls.cmp_trace4.Clear();
-      tls.cmp_trace8.Clear();
-      tls.cmp_traceN.Clear();
-    }
-  });
-  state.pc_counter_set.ForEachNonZeroByte(
-      [](size_t idx, uint8_t value) {}, 0,
-      state.actual_pc_counter_set_size_aligned);
-  if (state.run_time_flags.use_dataflow_features)
-    state.data_flow_feature_set.ForEachNonZeroBit([](size_t idx) {});
-  if (state.run_time_flags.use_cmp_features) {
-    state.cmp_feature_set.ForEachNonZeroBit([](size_t idx) {});
-    state.cmp_eq_set.ForEachNonZeroBit([](size_t idx) {});
-    state.cmp_moddiff_set.ForEachNonZeroBit([](size_t idx) {});
-    state.cmp_hamming_set.ForEachNonZeroBit([](size_t idx) {});
-    state.cmp_difflog_set.ForEachNonZeroBit([](size_t idx) {});
-  }
-  if (state.run_time_flags.path_level != 0)
-    state.path_feature_set.ForEachNonZeroBit([](size_t idx) {});
-  if (state.run_time_flags.callstack_level != 0)
-    state.callstack_set.ForEachNonZeroBit([](size_t idx) {});
-  for (auto *p = state.user_defined_begin; p != state.user_defined_end; ++p) {
-    *p = 0;
-  }
-  state.sancov_objects.ClearInlineCounters();
-}
-
-static void MaybeAddFeature(feature_t feature) {
-  if (!state.run_time_flags.skip_seen_features) {
-    state.g_features.push_back(feature);
-  } else if (!state.seen_features.get(feature)) {
-    state.g_features.push_back(feature);
-    state.seen_features.set(feature);
-  }
-}
-
-// Adds a kPCs and/or k8bitCounters feature to `g_features` based on arguments.
-// `idx` is a pc_index.
-// `counter_value` (non-zero) is a counter value associated with that PC.
-static void AddPcIndxedAndCounterToFeatures(size_t idx, uint8_t counter_value) {
-  if (state.run_time_flags.use_pc_features) {
-    MaybeAddFeature(feature_domains::kPCs.ConvertToMe(idx));
-  }
-  if (state.run_time_flags.use_counter_features) {
-    MaybeAddFeature(feature_domains::k8bitCounters.ConvertToMe(
-        Convert8bitCounterToNumber(idx, counter_value)));
-  }
-}
-
-// Post-processes all coverage data, puts it all into `g_features`.
-// `target_return_value` is the value returned by LLVMFuzzerTestOneInput.
-//
-// If `target_return_value == -1`, sets `g_features` to empty. This way,
-// the engine will reject any input that causes the target to return -1.
-// LibFuzzer supports this return value as of 2022-07:
-// https://llvm.org/docs/LibFuzzer.html#rejecting-unwanted-inputs
-__attribute__((noinline))  // so that we see it in profile.
-static void
-PostProcessCoverage(int target_return_value) {
-  state.g_features.clear();
-
-  if (target_return_value == -1) return;
-
-  // Convert counters to features.
-  state.pc_counter_set.ForEachNonZeroByte(
-      [](size_t idx, uint8_t value) {
-        AddPcIndxedAndCounterToFeatures(idx, value);
-      },
-      0, state.actual_pc_counter_set_size_aligned);
-
-  // Convert data flow bit set to features.
-  if (state.run_time_flags.use_dataflow_features) {
-    state.data_flow_feature_set.ForEachNonZeroBit([](size_t idx) {
-      MaybeAddFeature(feature_domains::kDataFlow.ConvertToMe(idx));
-    });
-  }
-
-  // Convert cmp bit set to features.
-  if (state.run_time_flags.use_cmp_features) {
-    // TODO(kcc): remove cmp_feature_set.
-    state.cmp_feature_set.ForEachNonZeroBit([](size_t idx) {
-      MaybeAddFeature(feature_domains::kCMP.ConvertToMe(idx));
-    });
-    state.cmp_eq_set.ForEachNonZeroBit([](size_t idx) {
-      MaybeAddFeature(feature_domains::kCMPEq.ConvertToMe(idx));
-    });
-    state.cmp_moddiff_set.ForEachNonZeroBit([](size_t idx) {
-      MaybeAddFeature(feature_domains::kCMPModDiff.ConvertToMe(idx));
-    });
-    state.cmp_hamming_set.ForEachNonZeroBit([](size_t idx) {
-      MaybeAddFeature(feature_domains::kCMPHamming.ConvertToMe(idx));
-    });
-    state.cmp_difflog_set.ForEachNonZeroBit([](size_t idx) {
-      MaybeAddFeature(feature_domains::kCMPDiffLog.ConvertToMe(idx));
-    });
-  }
-
-  // Convert path bit set to features.
-  if (state.run_time_flags.path_level != 0) {
-    state.path_feature_set.ForEachNonZeroBit([](size_t idx) {
-      MaybeAddFeature(feature_domains::kBoundedPath.ConvertToMe(idx));
-    });
-  }
-
-  // Iterate all threads and get features from TLS data.
-  state.ForEachTls([](ThreadLocalRunnerState &tls) {
-    if (state.run_time_flags.callstack_level != 0) {
-      RunnerCheck(tls.top_frame_sp >= tls.lowest_sp,
-                  "bad values of tls.top_frame_sp and tls.lowest_sp");
-      size_t sp_diff = tls.top_frame_sp - tls.lowest_sp;
-      MaybeAddFeature(feature_domains::kCallStack.ConvertToMe(sp_diff));
-    }
-  });
-
-  if (state.run_time_flags.callstack_level != 0) {
-    state.callstack_set.ForEachNonZeroBit([](size_t idx) {
-      MaybeAddFeature(feature_domains::kCallStack.ConvertToMe(idx));
-    });
-  }
-
-  // Copy the features from __centipede_extra_features to g_features.
-  // Zero features are ignored - we treat them as default (unset) values.
-  for (auto *p = state.user_defined_begin; p != state.user_defined_end; ++p) {
-    if (auto user_feature = *p) {
-      // User domain ID is upper 32 bits
-      feature_t user_domain_id = user_feature >> 32;
-      // User feature ID is lower 32 bits.
-      feature_t user_feature_id = user_feature & ((1ULL << 32) - 1);
-      // There is no hard guarantee how many user domains are actually
-      // available. If a user domain ID is out of range, alias it to an existing
-      // domain. This is kinder than silently dropping the feature.
-      user_domain_id %= std::size(feature_domains::kUserDomains);
-      MaybeAddFeature(feature_domains::kUserDomains[user_domain_id].ConvertToMe(
-          user_feature_id));
-      *p = 0;  // cleanup for the next iteration.
-    }
-  }
-
-  // Iterates all non-zero inline 8-bit counters, if they are present.
-  // Calls AddPcIndxedAndCounterToFeatures on non-zero counters and zeroes them.
-  if (state.run_time_flags.use_pc_features ||
-      state.run_time_flags.use_counter_features) {
-    state.sancov_objects.ForEachNonZeroInlineCounter(
-        [](size_t idx, uint8_t counter_value) {
-          AddPcIndxedAndCounterToFeatures(idx, counter_value);
-        });
-  }
-}
-
 void RunnerCallbacks::GetSeeds(std::function<void(ByteSpan)> seed_callback) {
   seed_callback({0});
 }
@@ -624,7 +324,9 @@ static void RunOneInput(const uint8_t *data, size_t size,
   PrepareCoverage(/*full_clear=*/false);
   state.stats.prep_time_usec = UsecSinceLast();
   state.ResetTimers();
+  coverage_state.test_started = true;
   int target_return_value = callbacks.Execute({data, size}) ? 0 : -1;
+  coverage_state.test_started = false;
   state.stats.exec_time_usec = UsecSinceLast();
   CheckWatchdogLimits();
   if (fuzztest::internal::state.input_start_time.exchange(0) != 0) {
@@ -668,8 +370,8 @@ ReadOneInputExecuteItAndDumpCoverage(const char *input_path,
                                      input_path);
   FILE *features_file = fopen(features_file_path, "w");
   PrintErrorAndExitIf(features_file == nullptr, "can't open coverage file");
-  WriteFeaturesToFile(features_file, state.g_features.data(),
-                      state.g_features.size());
+  WriteFeaturesToFile(features_file, coverage_state.g_features.data(),
+                      coverage_state.g_features.size());
   fclose(features_file);
 }
 
@@ -699,9 +401,9 @@ static bool StartSendingOutputsToEngine(BlobSequence &outputs_blobseq) {
 // Returns the byte size of `g_features`.
 static size_t CopyFeatures(uint8_t *data, size_t capacity) {
   const size_t features_len_in_bytes =
-      state.g_features.size() * sizeof(feature_t);
+      coverage_state.g_features.size() * sizeof(feature_t);
   if (features_len_in_bytes > capacity) return 0;
-  memcpy(data, state.g_features.data(), features_len_in_bytes);
+  memcpy(data, coverage_state.g_features.data(), features_len_in_bytes);
   return features_len_in_bytes;
 }
 
@@ -709,16 +411,18 @@ static size_t CopyFeatures(uint8_t *data, size_t capacity) {
 // Returns true on success.
 static bool FinishSendingOutputsToEngine(BlobSequence &outputs_blobseq) {
   {
-    LockGuard lock(state.execution_result_override_mu);
+    LockGuard lock(coverage_state.execution_result_override_mu);
     bool has_overridden_execution_result = false;
-    if (state.execution_result_override != nullptr) {
-      RunnerCheck(state.execution_result_override->results().size() <= 1,
-                  "unexpected number of overridden execution results");
+    if (coverage_state.execution_result_override != nullptr) {
+      RunnerCheck(
+          coverage_state.execution_result_override->results().size() <= 1,
+          "unexpected number of overridden execution results");
       has_overridden_execution_result =
-          state.execution_result_override->results().size() == 1;
+          coverage_state.execution_result_override->results().size() == 1;
     }
     if (has_overridden_execution_result) {
-      const auto &result = state.execution_result_override->results()[0];
+      const auto &result =
+          coverage_state.execution_result_override->results()[0];
       return BatchResult::WriteOneFeatureVec(result.features().data(),
                                              result.features().size(),
                                              outputs_blobseq) &&
@@ -729,21 +433,23 @@ static bool FinishSendingOutputsToEngine(BlobSequence &outputs_blobseq) {
   }
 
   // Copy features to shared memory.
-  if (!BatchResult::WriteOneFeatureVec(
-          state.g_features.data(), state.g_features.size(), outputs_blobseq)) {
+  if (!BatchResult::WriteOneFeatureVec(coverage_state.g_features.data(),
+                                       coverage_state.g_features.size(),
+                                       outputs_blobseq)) {
     return false;
   }
 
   ExecutionMetadata metadata;
   // Copy the CMP traces to shared memory.
-  if (state.run_time_flags.use_auto_dictionary) {
+  if (coverage_state.run_time_flags.use_auto_dictionary) {
     bool append_failed = false;
-    state.ForEachTls([&metadata, &append_failed](ThreadLocalRunnerState &tls) {
-      if (!AppendCmpEntries(tls.cmp_trace2, metadata)) append_failed = true;
-      if (!AppendCmpEntries(tls.cmp_trace4, metadata)) append_failed = true;
-      if (!AppendCmpEntries(tls.cmp_trace8, metadata)) append_failed = true;
-      if (!AppendCmpEntries(tls.cmp_traceN, metadata)) append_failed = true;
-    });
+    coverage_state.ForEachTls(
+        [&metadata, &append_failed](ThreadLocalRunnerState &tls) {
+          if (!AppendCmpEntries(tls.cmp_trace2, metadata)) append_failed = true;
+          if (!AppendCmpEntries(tls.cmp_trace4, metadata)) append_failed = true;
+          if (!AppendCmpEntries(tls.cmp_trace8, metadata)) append_failed = true;
+          if (!AppendCmpEntries(tls.cmp_traceN, metadata)) append_failed = true;
+        });
     if (append_failed) return false;
   }
   if (!BatchResult::WriteMetadata(metadata, outputs_blobseq)) return false;
@@ -794,10 +500,12 @@ static int ExecuteInputsFromShmem(BlobSequence &inputs_blobseq,
 
 // Dumps the pc table to `output_path`.
 // Requires that state.main_object is already computed.
 static void DumpPcTable(const char *absl_nonnull output_path) {
-  PrintErrorAndExitIf(!state.main_object.IsSet(), "main_object is not set");
+  fprintf(stderr, "DumpPcTable %s\n", output_path);
+  PrintErrorAndExitIf(!coverage_state.main_object.IsSet(),
+                      "main_object is not set");
   FILE *output_file = fopen(output_path, "w");
   PrintErrorAndExitIf(output_file == nullptr, "can't open output file");
-  std::vector<PCInfo> pcs = state.sancov_objects.CreatePCTable();
+  std::vector<PCInfo> pcs = coverage_state.sancov_objects.CreatePCTable();
   // Dump the pc table.
   const auto data_size_in_bytes = pcs.size() * sizeof(PCInfo);
   auto num_bytes_written =
@@ -810,10 +518,11 @@ static void DumpPcTable(const char *absl_nonnull output_path) {
 
 // Dumps the control-flow table to `output_path`.
 // Requires that state.main_object is already computed.
 static void DumpCfTable(const char *absl_nonnull output_path) {
-  PrintErrorAndExitIf(!state.main_object.IsSet(), "main_object is not set");
+  PrintErrorAndExitIf(!coverage_state.main_object.IsSet(),
+                      "main_object is not set");
   FILE *output_file = fopen(output_path, "w");
   PrintErrorAndExitIf(output_file == nullptr, "can't open output file");
-  std::vector<uintptr_t> data = state.sancov_objects.CreateCfTable();
+  std::vector<uintptr_t> data = coverage_state.sancov_objects.CreateCfTable();
   size_t data_size_in_bytes = data.size() * sizeof(data[0]);
   // Dump the table.
   auto num_bytes_written =
@@ -828,7 +537,7 @@ static void DumpCfTable(const char *absl_nonnull output_path) {
 static void DumpDsoTable(const char *absl_nonnull output_path) {
   FILE *output_file = fopen(output_path, "w");
   RunnerCheck(output_file != nullptr, "DumpDsoTable: can't open output file");
-  DsoTable dso_table = state.sancov_objects.CreateDsoTable();
+  DsoTable dso_table = coverage_state.sancov_objects.CreateDsoTable();
   for (const auto &entry : dso_table) {
     fprintf(output_file, "%s %zd\n", entry.path.c_str(),
             entry.num_instrumented_pcs);
@@ -1026,7 +735,7 @@ static void MaybePopulateReversePcTable() {
   const char *pcs_file_path = state.GetStringFlag(":pcs_file_path=");
   if (!pcs_file_path) return;
   const auto pc_table = ReadBytesFromFilePath(pcs_file_path);
-  state.reverse_pc_table.SetFromPCs(pc_table);
+  coverage_state.reverse_pc_table.SetFromPCs(pc_table);
 }
 
 // Create a fake reference to ForkServerCallMeVeryEarly() here so that the
@@ -1053,6 +762,7 @@ extern void RunnerInterceptor();
     &RunnerInterceptor;
 
 GlobalRunnerState::GlobalRunnerState() {
+  fprintf(stderr, "Centipede runner state constructor\n");
   // Make sure fork server is started if needed.
   ForkServerCallMeVeryEarly();
 
@@ -1064,8 +774,8 @@ GlobalRunnerState::GlobalRunnerState() {
   SetLimits();
 
   // Compute main_object.
-  main_object = GetDlInfo(state.GetStringFlag(":dl_path_suffix="));
-  if (!main_object.IsSet()) {
+  coverage_state.main_object = GetDlInfo(GetStringFlag(":dl_path_suffix="));
+  if (!coverage_state.main_object.IsSet()) {
     fprintf(
         stderr,
         "Failed to compute main_object. This may happen"
@@ -1073,7 +783,8 @@ GlobalRunnerState::GlobalRunnerState() {
   }
 
   // Dump the binary info tables.
-  if (state.HasFlag(":dump_binary_info:")) {
+  if (HasFlag(":dump_binary_info:")) {
+    fprintf(stderr, "Centipede runner state dump_binary_info\n");
     RunnerCheck(state.arg1 && state.arg2 && state.arg3,
                 "dump_binary_info requires 3 arguments");
     if (!state.arg1 || !state.arg2 || !state.arg3) _exit(EXIT_FAILURE);
@@ -1086,13 +797,16 @@ GlobalRunnerState::GlobalRunnerState() {
   MaybePopulateReversePcTable();
 
   // initialize the user defined section.
-  user_defined_begin = &__start___centipede_extra_features;
-  user_defined_end = &__stop___centipede_extra_features;
-  if (user_defined_begin && user_defined_end) {
+  coverage_state.user_defined_begin = &__start___centipede_extra_features;
+  coverage_state.user_defined_end = &__stop___centipede_extra_features;
+
+  feature_t *begin = coverage_state.user_defined_begin;
+  feature_t *end = coverage_state.user_defined_end;
+  if (begin && end) {
     fprintf(
         stderr,
         "section(\"__centipede_extra_features\") detected with %zd elements\n",
-        user_defined_end - user_defined_begin);
+        end - begin);
   }
 }
 
@@ -1100,7 +814,7 @@ GlobalRunnerState::~GlobalRunnerState() {
   // The process is winding down, but CentipedeRunnerMain did not run.
   // This means, the binary is standalone with its own main(), and we need to
   // report the coverage now.
-  if (!state.centipede_runner_main_executed && state.HasFlag(":shmem:")) {
+  if (!state.centipede_runner_main_executed && HasFlag(":shmem:")) {
     int exit_status = EXIT_SUCCESS;  // TODO(kcc): do we know our exit status?
     PostProcessCoverage(exit_status);
     SharedMemoryBlobSequence outputs_blobseq(state.arg2);
@@ -1108,14 +822,14 @@ GlobalRunnerState::~GlobalRunnerState() {
     FinishSendingOutputsToEngine(outputs_blobseq);
   }
   {
-    LockGuard lock(state.execution_result_override_mu);
-    if (state.execution_result_override != nullptr) {
-      delete state.execution_result_override;
-      state.execution_result_override = nullptr;
+    LockGuard lock(coverage_state.execution_result_override_mu);
+    if (coverage_state.execution_result_override != nullptr) {
+      delete coverage_state.execution_result_override;
+      coverage_state.execution_result_override = nullptr;
     }
   }
   // Always clean up detached TLSs to avoid leakage.
-  CleanUpDetachedTls();
+  coverage_state.CleanUpDetachedTls();
 }
 
 // If HasFlag(:shmem:), state.arg1 and state.arg2 are the names
@@ -1128,8 +842,9 @@ GlobalRunnerState::~GlobalRunnerState() {
 int RunnerMain(int argc, char **argv, RunnerCallbacks &callbacks) {
   state.centipede_runner_main_executed = true;
 
-  fprintf(stderr, "Centipede fuzz target runner; argv[0]: %s flags: %s\n",
-          argv[0], state.centipede_runner_flags);
+  fprintf(stderr,
+          "Centipede fuzz target runner; argv[0]: %s flags: %s arg1: %s\n",
+          argv[0], state.centipede_runner_flags, state.arg1);
 
   if (state.HasFlag(":dump_configuration:")) {
     DumpSerializedTargetConfigToFile(callbacks,
@@ -1155,10 +870,10 @@ int RunnerMain(int argc, char **argv, RunnerCallbacks &callbacks) {
     // We still pay for executing the coverage callbacks, but those will
     // return immediately.
     // TODO(kcc): do this more consistently, for all coverage types.
-    state.run_time_flags.use_cmp_features = false;
-    state.run_time_flags.use_pc_features = false;
-    state.run_time_flags.use_dataflow_features = false;
-    state.run_time_flags.use_counter_features = false;
+    coverage_state.run_time_flags.use_cmp_features = false;
+    coverage_state.run_time_flags.use_pc_features = false;
+    coverage_state.run_time_flags.use_dataflow_features = false;
+    coverage_state.run_time_flags.use_counter_features = false;
 
     // Mutation request.
     inputs_blobseq.Reset();
     state.byte_array_mutator =
@@ -1204,7 +919,8 @@ extern "C" void CentipedeSetRssLimit(size_t rss_limit_mb) {
 extern "C" void CentipedeSetStackLimit(size_t stack_limit_kb) {
   fprintf(stderr, "CentipedeSetStackLimit: changing stack_limit_kb to %zu\n",
           stack_limit_kb);
-  fuzztest::internal::state.run_time_flags.stack_limit_kb = stack_limit_kb;
+  fuzztest::internal::coverage_state.run_time_flags.stack_limit_kb =
+      stack_limit_kb;
 }
 
 extern "C" void CentipedeSetTimeoutPerInput(uint64_t timeout_per_input) {
@@ -1251,9 +967,11 @@ extern "C" void CentipedeEndExecutionBatch() {
 extern "C" void CentipedePrepareProcessing() {
   fuzztest::internal::PrepareCoverage(/*full_clear=*/!in_execution_batch);
   fuzztest::internal::state.ResetTimers();
+  fuzztest::internal::coverage_state.test_started = true;
 }
 
 extern "C" void CentipedeFinalizeProcessing() {
+  fuzztest::internal::coverage_state.test_started = false;
   fuzztest::internal::CheckWatchdogLimits();
   if (fuzztest::internal::state.input_start_time.exchange(0) != 0) {
     fuzztest::internal::PostProcessCoverage(/*target_return_value=*/0);
@@ -1274,40 +992,18 @@ extern "C" size_t CentipedeGetCoverageData(uint8_t *data, size_t capacity) {
 }
 
 extern "C" void CentipedeSetExecutionResult(const uint8_t *data, size_t size) {
-  using fuzztest::internal::state;
-  fuzztest::internal::LockGuard lock(state.execution_result_override_mu);
-  if (!state.execution_result_override)
-    state.execution_result_override = new fuzztest::internal::BatchResult();
-  state.execution_result_override->ClearAndResize(1);
+  using fuzztest::internal::coverage_state;
+  fuzztest::internal::LockGuard lock(
+      coverage_state.execution_result_override_mu);
+  if (!coverage_state.execution_result_override)
+    coverage_state.execution_result_override =
+        new fuzztest::internal::BatchResult();
+  coverage_state.execution_result_override->ClearAndResize(1);
   if (data == nullptr) return;
   // Removing const here should be fine as we don't write to `blobseq`.
   fuzztest::internal::BlobSequence blobseq(const_cast<uint8_t *>(data), size);
-  state.execution_result_override->Read(blobseq);
+  coverage_state.execution_result_override->Read(blobseq);
   fuzztest::internal::RunnerCheck(
-      state.execution_result_override->num_outputs_read() == 1,
+      coverage_state.execution_result_override->num_outputs_read() == 1,
       "Failed to set execution result from CentipedeSetExecutionResult");
 }
-
-extern "C" void CentipedeSetFailureDescription(const char *description) {
-  using fuzztest::internal::state;
-  if (state.failure_description_path == nullptr) return;
-  // Make sure that the write is atomic and only happens once.
-  [[maybe_unused]] static int write_once = [=] {
-    FILE *f = fopen(state.failure_description_path, "w");
-    if (f == nullptr) {
-      perror("FAILURE: fopen()");
-      return 0;
-    }
-    const auto len = strlen(description);
-    if (fwrite(description, 1, len, f) != len) {
-      perror("FAILURE: fwrite()");
-    }
-    if (fflush(f) != 0) {
-      perror("FAILURE: fflush()");
-    }
-    if (fclose(f) != 0) {
-      perror("FAILURE: fclose()");
-    }
-    return 0;
-  }();
-}
diff --git a/centipede/runner.h b/centipede/runner.h
index 8e4ff8eff..9d44b21d3 100644
--- a/centipede/runner.h
+++ b/centipede/runner.h
@@ -44,89 +44,17 @@
 
 namespace fuzztest::internal {
 
-// Like std::lock_guard, but for pthread_mutex_t.
-class LockGuard {
- public:
-  explicit LockGuard(pthread_mutex_t &mu) : mu_(mu) { pthread_mutex_lock(&mu); }
-  ~LockGuard() { pthread_mutex_unlock(&mu_); }
-
- private:
-  pthread_mutex_t &mu_;
-};
-
 // Flags derived from CENTIPEDE_RUNNER_FLAGS.
 // Flags derived from CENTIPEDE_RUNNER_FLAGS.
 // Flags used in instrumentation callbacks are bit-packed for efficiency.
 struct RunTimeFlags {
-  uint64_t path_level : 8;
-  uint64_t use_pc_features : 1;
-  uint64_t use_dataflow_features : 1;
-  uint64_t use_cmp_features : 1;
-  uint64_t callstack_level : 8;
-  uint64_t use_counter_features : 1;
-  uint64_t use_auto_dictionary : 1;
   std::atomic<uint64_t> timeout_per_input;
   uint64_t timeout_per_batch;
-  std::atomic<uint64_t> stack_limit_kb;
   std::atomic<uint64_t> rss_limit_mb;
   uint64_t crossover_level;
-  uint64_t skip_seen_features : 1;
   uint64_t ignore_timeout_reports : 1;
   uint64_t max_len;
 };
 
-// One such object is created in runner's TLS.
-// There is no CTOR, since we don't want to use the brittle and lazy TLS CTORs.
-// All data members are zero-initialized during thread creation.
-struct ThreadLocalRunnerState {
-  // Traces the memory comparison of `n` bytes at `s1` and `s2` called at
-  // `caller_pc` with `is_equal` indicating whether the two memory regions have
-  // equal contents. May add cmp features and auto-dictionary entries if
-  // enabled.
-  void TraceMemCmp(uintptr_t caller_pc, const uint8_t *s1, const uint8_t *s2,
-                   size_t n, bool is_equal);
-
-  // Intrusive doubly-linked list of TLS objects.
-  // Guarded by state.tls_list_mu.
-  ThreadLocalRunnerState *next, *prev;
-
-  // The pthread_create() interceptor calls OnThreadStart() before the thread
-  // callback. The main thread also calls OnThreadStart(). OnThreadStop() will
-  // be called when thread termination is detected internally - see runner.cc.
-  void OnThreadStart();
-  void OnThreadStop();
-
-  // Whether OnThreadStart() is called on this thread. This is used as a proxy
-  // of the readiness of the lower-level runtime.
-  bool started;
-
-  // Paths are thread-local, so we maintain the current bounded path here.
-  // We allow paths of up to 100, controlled at run-time via the "path_level".
-  static constexpr uint64_t kBoundedPathLength = 100;
-  HashedRingBuffer<kBoundedPathLength> path_ring_buffer;
-
-  // Value of SP in the top call frame of the thread, computed in OnThreadStart.
-  uintptr_t top_frame_sp;
-  // The lower bound of the stack region of this thread. 0 means unknown.
-  uintptr_t stack_region_low;
-  // Lowest observed value of SP.
-  uintptr_t lowest_sp;
-
-  // The (imprecise) call stack is updated by the PC callback.
-  CallStack<> call_stack;
-
-  // Cmp traces capture the arguments of CMP instructions, memcmp, etc.
-  // We have dedicated traces for 2-, 4-, and 8-byte comparison, and
-  // a catch-all `cmp_traceN` trace for memcmp, etc.
-  CmpTrace<2, 64> cmp_trace2;
-  CmpTrace<4, 64> cmp_trace4;
-  CmpTrace<8, 64> cmp_trace8;
-  CmpTrace<0, 64> cmp_traceN;
-
-  // Set this to true if the thread needs to be ignored in ForEachTLS.
-  // It should be always false if the state is in the global detached_tls_list.
-  bool ignore;
-};
-
 // One global object of this type is created by the runner at start up.
 // All data members will be initialized to zero, unless they have initializers.
 // Accesses to the subobjects should be fast, so we are trying to avoid
@@ -157,26 +85,13 @@ struct GlobalRunnerState {
   const char *arg1 = GetStringFlag(":arg1=");
   const char *arg2 = GetStringFlag(":arg2=");
   const char *arg3 = GetStringFlag(":arg3=");
 
-  // The path to a file where the runner may write the description of failure.
-  const char *failure_description_path =
-      GetStringFlag(":failure_description_path=");
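The `RunTimeFlags` struct above packs boolean knobs and small levels into `uint64_t` bit-fields so the hot instrumentation callbacks can read them cheaply. A reduced illustration of that packing (`FlagsSketch` is illustrative only, not part of this patch):

#include <cstdint>

struct FlagsSketch {
  uint64_t path_level : 8;       // values 0..255, like the field removed above
  uint64_t use_pc_features : 1;  // 0 or 1
};
// Both fields share one 8-byte storage unit (on the common ABIs).
static_assert(sizeof(FlagsSketch) == sizeof(uint64_t));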
 
   // Flags.
   RunTimeFlags run_time_flags = {
-      /*path_level=*/std::min(ThreadLocalRunnerState::kBoundedPathLength,
-                              HasIntFlag(":path_level=", 0)),
-      /*use_pc_features=*/HasFlag(":use_pc_features:"),
-      /*use_dataflow_features=*/HasFlag(":use_dataflow_features:"),
-      /*use_cmp_features=*/HasFlag(":use_cmp_features:"),
-      /*callstack_level=*/HasIntFlag(":callstack_level=", 0),
-      /*use_counter_features=*/HasFlag(":use_counter_features:"),
-      /*use_auto_dictionary=*/HasFlag(":use_auto_dictionary:"),
       /*timeout_per_input=*/HasIntFlag(":timeout_per_input=", 0),
       /*timeout_per_batch=*/HasIntFlag(":timeout_per_batch=", 0),
-      /*stack_limit_kb=*/HasIntFlag(":stack_limit_kb=", 0),
       /*rss_limit_mb=*/HasIntFlag(":rss_limit_mb=", 0),
       /*crossover_level=*/HasIntFlag(":crossover_level=", 50),
-      /*skip_seen_features=*/HasFlag(":skip_seen_features:"),
       /*ignore_timeout_reports=*/HasFlag(":ignore_timeout_reports:"),
       /*max_len=*/HasIntFlag(":max_len=", 4000),
   };
@@ -217,87 +132,6 @@ struct GlobalRunnerState {
     return strndup(value_beg, end - value_beg);
   }
 
-  pthread_mutex_t execution_result_override_mu = PTHREAD_MUTEX_INITIALIZER;
-  // If not nullptr, it points to a batch result with either zero or one
-  // execution. When an execution result present, it will be passed as the
-  // execution result of the current test input. The object is owned and cleaned
-  // up by the state, protected by execution_result_override_mu, and set by
-  // `CentipedeSetExecutionResult()`.
-  BatchResult *execution_result_override;
-
-  // Doubly linked list of TLSs of all live threads.
-  ThreadLocalRunnerState *tls_list;
-  // Doubly linked list of detached TLSs.
-  ThreadLocalRunnerState *detached_tls_list;
-  // Guards `tls_list` and `detached_tls_list`.
-  pthread_mutex_t tls_list_mu = PTHREAD_MUTEX_INITIALIZER;
-  // Iterates all TLS objects under tls_list_mu, except those with `ignore` set.
-  // Calls `callback()` on every TLS.
-  template <typename Callback>
-  void ForEachTls(Callback callback) {
-    LockGuard lock(tls_list_mu);
-    for (auto *it = tls_list; it; it = it->next) {
-      if (!it->ignore) callback(*it);
-    }
-    for (auto *it = detached_tls_list; it; it = it->next) {
-      callback(*it);
-    }
-  }
-
-  // Reclaims all TLSs in detached_tls_list and cleans up the list.
-  void CleanUpDetachedTls();
-
-  // Computed by DlInfo().
-  // Usually, the main object is the executable binary containing main()
-  // and most of the executable code (we assume that the target is
-  // built in mostly-static mode, i.e. -dynamic_mode=off).
-  // When the `dl_path_suffix` runner flag is provided, the main_object refers
-  // to the dynamic library (DSO) pointed to by this flag.
-  //
-  // Note: this runner currently does not support more than one instrumented
-  // DSO in the process, i.e. you either instrument the main binary, or one DSO.
-  // Supporting more than one DSO will require major changes,
-  // major added complexity, and potentially cause slowdown.
-  // There is currently no motivation for such a change.
-  DlInfo main_object;
-
-  // State for SanitizerCoverage.
-  // See https://clang.llvm.org/docs/SanitizerCoverage.html.
-  SanCovObjectArray sancov_objects;
-
-  // An arbitrarily large size.
-  static constexpr size_t kDataFlowFeatureSetSize = 1 << 18;
-  ConcurrentBitSet<kDataFlowFeatureSetSize> data_flow_feature_set{
-      absl::kConstInit};
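A hypothetical caller of the `ForEachTls()` shown above, to make the iteration contract concrete: the callback runs under `tls_list_mu` for live threads, skips entries with `ignore` set, and also visits detached TLSs. `CountStartedThreads` is not from this patch, and the sketch assumes the declarations remain visible from their new home in coverage_state.h:

#include <cstddef>

size_t CountStartedThreads(fuzztest::internal::GlobalRunnerState &s) {
  size_t n = 0;
  s.ForEachTls([&n](fuzztest::internal::ThreadLocalRunnerState &t) {
    if (t.started) ++n;  // `started` is set by OnThreadStart()
  });
  return n;
}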
-
-  // Tracing CMP instructions, capture events from these domains:
-  // kCMPEq, kCMPModDiff, kCMPHamming, kCMPModDiffLog, kCMPMsbEq.
-  // See https://clang.llvm.org/docs/SanitizerCoverage.html#tracing-data-flow.
-  // An arbitrarily large size.
-  static constexpr size_t kCmpFeatureSetSize = 1 << 18;
-  // TODO(kcc): remove cmp_feature_set.
-  ConcurrentBitSet<kCmpFeatureSetSize> cmp_feature_set{absl::kConstInit};
-  ConcurrentBitSet<kCmpFeatureSetSize> cmp_eq_set{absl::kConstInit};
-  ConcurrentBitSet<kCmpFeatureSetSize> cmp_moddiff_set{absl::kConstInit};
-  ConcurrentBitSet<kCmpFeatureSetSize> cmp_hamming_set{absl::kConstInit};
-  ConcurrentBitSet<kCmpFeatureSetSize> cmp_difflog_set{absl::kConstInit};
-
-  // We think that call stack produces rich signal, so we give a few bits to it.
-  static constexpr size_t kCallStackFeatureSetSize = 1 << 24;
-  ConcurrentBitSet<kCallStackFeatureSetSize> callstack_set{absl::kConstInit};
-
-  // kMaxNumPcs is the maximum number of instrumented PCs in the binary.
-  // We can be generous here since the unused memory will not cost anything.
-  // `pc_counter_set` is a static byte set supporting up to kMaxNumPcs PCs.
-  static constexpr size_t kMaxNumPcs = 1 << 28;
-  TwoLayerConcurrentByteSet<kMaxNumPcs> pc_counter_set{absl::kConstInit};
-  // This is the actual number of PCs, aligned up to
-  // pc_counter_set::kSizeMultiple, computed at startup.
-  size_t actual_pc_counter_set_size_aligned;
-
-  // Initialized in CTOR from the __centipede_extra_features section.
-  feature_t *user_defined_begin;
-  feature_t *user_defined_end;
-
   // We use edge instrumentation w/ callbacks to implement bounded-path
   // coverage.
   // * The current PC is converted to an offset (a PC index).
@@ -313,19 +147,9 @@ struct GlobalRunnerState {
   //   * Use call stacks instead of paths (via unwinding or other
   //     instrumentation).
 
-  // An arbitrarily large size.
-  static constexpr size_t kPathBitSetSize = 1 << 25;
-  // Observed paths. The total number of observed paths for --path_level=N
-  // can be up to NumPCs**N.
-  // So, we make the bitset very large, but it may still saturate.
-  ConcurrentBitSet<kPathBitSetSize> path_feature_set{absl::kConstInit};
-
   // Execution stats for the currently executed input.
   ExecutionResult::Stats stats;
 
-  // Used by trace_pc instrumentation. Populated if `pcs_file_path` flag is set.
-  ReversePCTable reverse_pc_table;
-
   // CentipedeRunnerMain() sets this to true.
   bool centipede_runner_main_executed = false;
@@ -346,23 +170,9 @@ struct GlobalRunnerState {
   // The Watchdog thread sets this to true.
   std::atomic<bool> watchdog_thread_started;
-
-  // An arbitrarily large size.
-  static const size_t kMaxFeatures = 1 << 20;
-  // FeatureArray used to accumulate features from all sources.
-  FeatureArray<kMaxFeatures> g_features;
-
-  // Features that were seen before.
-  static constexpr size_t kSeenFeatureSetSize =
-      absl::bit_ceil(feature_domains::kLastDomain.end());
-  ConcurrentBitSet<kSeenFeatureSetSize> seen_features{absl::kConstInit};
 };
 
-extern GlobalRunnerState state;
-extern __thread ThreadLocalRunnerState tls;
-
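The removed `kSeenFeatureSetSize` above is computed with `absl::bit_ceil`, which rounds its argument up to the next power of two so the bitset size stays allocation-friendly. A quick self-contained illustration (the BUILD changes in this patch already depend on `@abseil-cpp//absl/numeric:bits`):

#include <cstddef>

#include "absl/numeric/bits.h"

static_assert(absl::bit_ceil(size_t{1000}) == size_t{1024});
// Values that are already powers of two are left unchanged:
static_assert(absl::bit_ceil(size_t{4096}) == size_t{4096});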
-// Check for stack limit for the stack pointer `sp` in the current thread.
-void CheckStackLimit(uintptr_t sp);
+// extern GlobalRunnerState state;
 
 }  // namespace fuzztest::internal
 
diff --git a/centipede/runner_interceptors.cc b/centipede/runner_interceptors.cc
index 886c45080..5b1b9fdcf 100644
--- a/centipede/runner_interceptors.cc
+++ b/centipede/runner_interceptors.cc
@@ -22,7 +22,7 @@
 
 #include "absl/base/nullability.h"
 #include "absl/base/optimization.h"
-#include "./centipede/runner.h"
+#include "./centipede/coverage_state.h"
 
 using fuzztest::internal::tls;
 
diff --git a/centipede/runner_sancov.cc b/centipede/runner_sancov.cc
index 22435f4e8..aab585596 100644
--- a/centipede/runner_sancov.cc
+++ b/centipede/runner_sancov.cc
@@ -22,20 +22,20 @@
 #include <cstdint>
 
 #include "absl/base/nullability.h"
+#include "./centipede/coverage_state.h"
 #include "./centipede/feature.h"
 #include "./centipede/int_utils.h"
 #include "./centipede/pc_info.h"
 #include "./centipede/reverse_pc_table.h"
-#include "./centipede/runner.h"
 #include "./centipede/runner_dl_info.h"
 
 namespace fuzztest::internal {
 void RunnerSancov() {}  // to be referenced in runner.cc
 }  // namespace fuzztest::internal
 
+using fuzztest::internal::coverage_state;
 using fuzztest::internal::PCGuard;
 using fuzztest::internal::PCInfo;
-using fuzztest::internal::state;
 using fuzztest::internal::tls;
 
 // Tracing data flow.
@@ -65,34 +65,37 @@ using fuzztest::internal::tls;
 
 // NOTE: Enforce inlining so that `__builtin_return_address` works.
 ENFORCE_INLINE static void TraceLoad(void *addr) {
-  if (!state.run_time_flags.use_dataflow_features) return;
+  if (!coverage_state.run_time_flags.use_dataflow_features) return;
   auto caller_pc = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
   auto load_addr = reinterpret_cast<uintptr_t>(addr);
-  auto pc_offset = caller_pc - state.main_object.start_address;
-  if (pc_offset >= state.main_object.size) return;  // PC outside main obj.
-  auto addr_offset = load_addr - state.main_object.start_address;
-  if (addr_offset >= state.main_object.size) return;  // Not a global address.
-  state.data_flow_feature_set.set(fuzztest::internal::ConvertPcPairToNumber(
-      pc_offset, addr_offset, state.main_object.size));
+  auto pc_offset = caller_pc - coverage_state.main_object.start_address;
+  if (pc_offset >= coverage_state.main_object.size)
+    return;  // PC outside main obj.
+  auto addr_offset = load_addr - coverage_state.main_object.start_address;
+  if (addr_offset >= coverage_state.main_object.size)
+    return;  // Not a global address.
+  coverage_state.data_flow_feature_set.set(
+      fuzztest::internal::ConvertPcPairToNumber(
+          pc_offset, addr_offset, coverage_state.main_object.size));
 }
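`TraceLoad()` above folds a (PC offset, address offset) pair into one feature number via `ConvertPcPairToNumber()` from int_utils.h. A minimal sketch of the idea, assuming only that both offsets are already bounds-checked against `size` as in the code above (the real function may differ):

#include <cstdint>

// Maps two offsets in [0, size) to a single number; distinct in-range pairs
// map to distinct values, which is all a feature index needs.
uint64_t PackPairSketch(uint64_t pc_offset, uint64_t addr_offset,
                        uint64_t size) {
  return pc_offset * size + addr_offset;
}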
 
 // NOTE: Enforce inlining so that `__builtin_return_address` works.
 ENFORCE_INLINE static void TraceCmp(uint64_t Arg1, uint64_t Arg2) {
-  if (!state.run_time_flags.use_cmp_features) return;
+  if (!coverage_state.run_time_flags.use_cmp_features) return;
   auto caller_pc = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
-  auto pc_offset = caller_pc - state.main_object.start_address;
+  auto pc_offset = caller_pc - coverage_state.main_object.start_address;
   uintptr_t hash =
       fuzztest::internal::Hash64Bits(pc_offset) ^ tls.path_ring_buffer.hash();
   if (Arg1 == Arg2) {
-    state.cmp_eq_set.set(hash);
+    coverage_state.cmp_eq_set.set(hash);
   } else {
     hash <<= 6;  // ABTo* generate 6-bit numbers.
-    state.cmp_moddiff_set.set(hash |
-                              fuzztest::internal::ABToCmpModDiff(Arg1, Arg2));
-    state.cmp_hamming_set.set(hash |
-                              fuzztest::internal::ABToCmpHamming(Arg1, Arg2));
-    state.cmp_difflog_set.set(hash |
-                              fuzztest::internal::ABToCmpDiffLog(Arg1, Arg2));
+    coverage_state.cmp_moddiff_set.set(
+        hash | fuzztest::internal::ABToCmpModDiff(Arg1, Arg2));
+    coverage_state.cmp_hamming_set.set(
+        hash | fuzztest::internal::ABToCmpHamming(Arg1, Arg2));
+    coverage_state.cmp_difflog_set.set(
+        hash | fuzztest::internal::ABToCmpDiffLog(Arg1, Arg2));
   }
 }
 
@@ -114,19 +117,19 @@ void __sanitizer_cov_trace_const_cmp1(uint8_t Arg1, uint8_t Arg2) {
 NO_SANITIZE
 void __sanitizer_cov_trace_const_cmp2(uint16_t Arg1, uint16_t Arg2) {
   TraceCmp(Arg1, Arg2);
-  if (Arg1 != Arg2 && state.run_time_flags.use_auto_dictionary)
+  if (Arg1 != Arg2 && coverage_state.run_time_flags.use_auto_dictionary)
     tls.cmp_trace2.Capture(Arg1, Arg2);
 }
 NO_SANITIZE
 void __sanitizer_cov_trace_const_cmp4(uint32_t Arg1, uint32_t Arg2) {
   TraceCmp(Arg1, Arg2);
-  if (Arg1 != Arg2 && state.run_time_flags.use_auto_dictionary)
+  if (Arg1 != Arg2 && coverage_state.run_time_flags.use_auto_dictionary)
     tls.cmp_trace4.Capture(Arg1, Arg2);
 }
 NO_SANITIZE
 void __sanitizer_cov_trace_const_cmp8(uint64_t Arg1, uint64_t Arg2) {
   TraceCmp(Arg1, Arg2);
-  if (Arg1 != Arg2 && state.run_time_flags.use_auto_dictionary)
+  if (Arg1 != Arg2 && coverage_state.run_time_flags.use_auto_dictionary)
     tls.cmp_trace8.Capture(Arg1, Arg2);
 }
 NO_SANITIZE
@@ -136,19 +139,19 @@ void __sanitizer_cov_trace_cmp1(uint8_t Arg1, uint8_t Arg2) {
 NO_SANITIZE
 void __sanitizer_cov_trace_cmp2(uint16_t Arg1, uint16_t Arg2) {
   TraceCmp(Arg1, Arg2);
-  if (Arg1 != Arg2 && state.run_time_flags.use_auto_dictionary)
+  if (Arg1 != Arg2 && coverage_state.run_time_flags.use_auto_dictionary)
    tls.cmp_trace2.Capture(Arg1, Arg2);
 }
 NO_SANITIZE
 void __sanitizer_cov_trace_cmp4(uint32_t Arg1, uint32_t Arg2) {
   TraceCmp(Arg1, Arg2);
-  if (Arg1 != Arg2 && state.run_time_flags.use_auto_dictionary)
+  if (Arg1 != Arg2 && coverage_state.run_time_flags.use_auto_dictionary)
     tls.cmp_trace4.Capture(Arg1, Arg2);
 }
 NO_SANITIZE
 void __sanitizer_cov_trace_cmp8(uint64_t Arg1, uint64_t Arg2) {
   TraceCmp(Arg1, Arg2);
-  if (Arg1 != Arg2 && state.run_time_flags.use_auto_dictionary)
+  if (Arg1 != Arg2 && coverage_state.run_time_flags.use_auto_dictionary)
     tls.cmp_trace8.Capture(Arg1, Arg2);
 }
 // TODO(kcc): [impl] handle switch.
@@ -159,7 +162,7 @@ void __sanitizer_cov_trace_switch(uint64_t Val, uint64_t *Cases) {}
 
 // -fsanitize-coverage=inline-8bit-counters is used.
 // See https://clang.llvm.org/docs/SanitizerCoverage.html#inline-8bit-counters
 void __sanitizer_cov_8bit_counters_init(uint8_t *beg, uint8_t *end) {
-  state.sancov_objects.Inline8BitCountersInit(beg, end);
+  coverage_state.sancov_objects.Inline8BitCountersInit(beg, end);
 }
 
 // https://clang.llvm.org/docs/SanitizerCoverage.html#pc-table
@@ -169,13 +172,13 @@ void __sanitizer_cov_8bit_counters_init(uint8_t *beg, uint8_t *end) {
 // We currently do not support more than one sancov-instrumented DSO.
 void __sanitizer_cov_pcs_init(const PCInfo *absl_nonnull beg,
                               const PCInfo *end) {
-  state.sancov_objects.PCInfoInit(beg, end);
+  coverage_state.sancov_objects.PCInfoInit(beg, end);
 }
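The `hash <<= 6` in `TraceCmp()` above reserves the low six bits of the feature for an `ABTo*` value, which per the inline comment is always in [0, 64), so the comparison-domain value and the PC/path hash cannot collide within one feature. In isolation:

#include <cstdint>

// Combines a hash with a 6-bit comparison-domain value, as TraceCmp does.
uint64_t CombineSketch(uint64_t hash, uint64_t six_bit_value) {
  return (hash << 6) | (six_bit_value & 0x3F);  // low 6 bits: domain value
}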
 
 // https://clang.llvm.org/docs/SanitizerCoverage.html#tracing-control-flow
 // This function is called at the DSO init time.
 void __sanitizer_cov_cfs_init(const uintptr_t *beg, const uintptr_t *end) {
-  state.sancov_objects.CFSInit(beg, end);
+  coverage_state.sancov_objects.CFSInit(beg, end);
 }
 
 // Updates the state of the paths, `path_level > 0`.
@@ -183,7 +186,7 @@ void __sanitizer_cov_cfs_init(const uintptr_t *beg, const uintptr_t *end) {
 // of __sanitizer_cov_trace_pc_guard.
 __attribute__((noinline)) static void HandlePath(uintptr_t normalized_pc) {
   uintptr_t hash = tls.path_ring_buffer.push(normalized_pc);
-  state.path_feature_set.set(hash);
+  coverage_state.path_feature_set.set(hash);
 }
 
 // Handles one observed PC.
@@ -194,8 +197,8 @@ __attribute__((noinline)) static void HandlePath(uintptr_t normalized_pc) {
 // With __sanitizer_cov_trace_pc this is PC itself, normalized by subtracting
 // the DSO's dynamic start address.
 static ENFORCE_INLINE void HandleOnePc(PCGuard pc_guard) {
-  if (!state.run_time_flags.use_pc_features) return;
-  state.pc_counter_set.SaturatedIncrement(pc_guard.pc_index);
+  if (!coverage_state.run_time_flags.use_pc_features) return;
+  coverage_state.pc_counter_set.SaturatedIncrement(pc_guard.pc_index);
 
   if (pc_guard.is_function_entry) {
     uintptr_t sp = reinterpret_cast<uintptr_t>(__builtin_frame_address(0));
@@ -209,14 +212,15 @@ static ENFORCE_INLINE void HandleOnePc(PCGuard pc_guard) {
       tls.lowest_sp = sp;
       fuzztest::internal::CheckStackLimit(sp);
     }
-    if (state.run_time_flags.callstack_level != 0) {
+    if (coverage_state.run_time_flags.callstack_level != 0) {
       tls.call_stack.OnFunctionEntry(pc_guard.pc_index, sp);
-      state.callstack_set.set(tls.call_stack.Hash());
+      coverage_state.callstack_set.set(tls.call_stack.Hash());
     }
   }
 
   // path features.
-  if (state.run_time_flags.path_level != 0) HandlePath(pc_guard.pc_index);
+  if (coverage_state.run_time_flags.path_level != 0)
+    HandlePath(pc_guard.pc_index);
 }
 
 // Caller PC is the PC of the call instruction.
@@ -235,9 +239,9 @@ static uintptr_t ReturnAddressToCallerPc(uintptr_t return_address) {
 
 // Sets `actual_pc_counter_set_size_aligned` to `size`, properly aligned up.
 static void UpdatePcCounterSetSizeAligned(size_t size) {
-  constexpr size_t kAlignment = state.pc_counter_set.kSizeMultiple;
+  constexpr size_t kAlignment = coverage_state.pc_counter_set.kSizeMultiple;
   constexpr size_t kMask = kAlignment - 1;
-  state.actual_pc_counter_set_size_aligned = (size + kMask) & ~kMask;
+  coverage_state.actual_pc_counter_set_size_aligned = (size + kMask) & ~kMask;
 }
 
 // MainObjectLazyInit() and helpers allow us to initialize state.main_object
@@ -260,11 +264,11 @@ static void UpdatePcCounterSetSizeAligned(size_t size) {
 //   b) it will slowdown the hot function.
 static pthread_once_t main_object_lazy_init_once = PTHREAD_ONCE_INIT;
 static void MainObjectLazyInitOnceCallback() {
-  state.main_object =
-      fuzztest::internal::GetDlInfo(state.GetStringFlag(":dl_path_suffix="));
+  coverage_state.main_object = fuzztest::internal::GetDlInfo(
+      coverage_state.GetStringFlag(":dl_path_suffix="));
   fprintf(stderr, "MainObjectLazyInitOnceCallback %zx\n",
-          state.main_object.start_address);
-  UpdatePcCounterSetSizeAligned(state.reverse_pc_table.NumPcs());
+          coverage_state.main_object.start_address);
+  UpdatePcCounterSetSizeAligned(coverage_state.reverse_pc_table.NumPcs());
 }
 
 __attribute__((noinline)) static void MainObjectLazyInit() {
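`MainObjectLazyInit()` above relies on the standard `pthread_once` idiom: the once-callback runs exactly one time even if many threads hit the call concurrently, and every caller returns only after initialization completes. Stripped to its skeleton (names here are illustrative):

#include <pthread.h>

#include <cstdio>

static pthread_once_t g_once = PTHREAD_ONCE_INIT;
static void InitOnceCallback() { std::puts("runs exactly once"); }

void HotPath() {
  pthread_once(&g_once, InitOnceCallback);
  // ... proceed; initialization is guaranteed to be complete here ...
}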
@@ -281,15 +285,15 @@ __attribute__((noinline)) static void MainObjectLazyInit() {
 // this variant.
 void __sanitizer_cov_trace_pc() {
   uintptr_t pc = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
-  if (!state.main_object.start_address ||
-      !state.actual_pc_counter_set_size_aligned) {
+  if (!coverage_state.main_object.start_address ||
+      !coverage_state.actual_pc_counter_set_size_aligned) {
     // Don't track coverage at all before the PC table is initialized.
-    if (state.reverse_pc_table.NumPcs() == 0) return;
+    if (coverage_state.reverse_pc_table.NumPcs() == 0) return;
     MainObjectLazyInit();
   }
-  pc -= state.main_object.start_address;
+  pc -= coverage_state.main_object.start_address;
   pc = ReturnAddressToCallerPc(pc);
-  const auto pc_guard = state.reverse_pc_table.GetPCGuard(pc);
+  const auto pc_guard = coverage_state.reverse_pc_table.GetPCGuard(pc);
   // TODO(kcc): compute is_function_entry for this case.
   if (pc_guard.IsValid()) HandleOnePc(pc_guard);
 }
 
@@ -297,8 +301,9 @@ void __sanitizer_cov_trace_pc() {
 // This function is called at the DSO init time.
 void __sanitizer_cov_trace_pc_guard_init(PCGuard *absl_nonnull start,
                                          PCGuard *stop) {
-  state.sancov_objects.PCGuardInit(start, stop);
-  UpdatePcCounterSetSizeAligned(state.sancov_objects.NumInstrumentedPCs());
+  coverage_state.sancov_objects.PCGuardInit(start, stop);
+  UpdatePcCounterSetSizeAligned(
+      coverage_state.sancov_objects.NumInstrumentedPCs());
 }
 
 // This function is called on every instrumented edge.
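`UpdatePcCounterSetSizeAligned()` earlier in this file uses the classic power-of-two align-up trick `(size + kMask) & ~kMask`. Standalone, with the precondition spelled out:

#include <cstddef>

// Rounds `size` up to a multiple of `alignment`; `alignment` must be a
// power of two, as pc_counter_set.kSizeMultiple is assumed to be here.
constexpr size_t AlignUp(size_t size, size_t alignment) {
  return (size + alignment - 1) & ~(alignment - 1);
}
static_assert(AlignUp(100, 64) == 128);
static_assert(AlignUp(128, 64) == 128);  // already aligned: unchanged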