10 changes: 10 additions & 0 deletions benchpress/config/jobs.yml
@@ -1045,6 +1045,7 @@
- '--timeout-buffer={timeout_buffer}'
- '--disable-tls={disable_tls}'
- '--smart-nanosleep={smart_nanosleep}'
- '--memory-file={memory_file}'
- '--real'
vars:
- 'interface_name=eth0'
@@ -1056,6 +1057,7 @@
- 'timeout_buffer=120'
- 'disable_tls=0'
- 'smart_nanosleep=0'
- 'memory_file='
client:
args:
- 'client'
@@ -1123,6 +1125,7 @@
- '--client-wait-after-warmup={client_wait_after_warmup}'
- '--disable-tls={disable_tls}'
- '--smart-nanosleep={smart_nanosleep}'
- '--memory-file={memory_file}'
- '--real'
vars:
- 'num_servers=0'
@@ -1144,6 +1147,7 @@
- 'client_wait_after_warmup=5'
- 'disable_tls=0'
- 'smart_nanosleep=0'
- 'memory_file='
hooks:
- hook: tao_instruction
- hook: copymove
@@ -1179,6 +1183,7 @@
- '--num-slow-threads={num_slow_threads}'
- '--num-fast-threads={num_fast_threads}'
- '--num-client-threads={num_client_threads}'
- '--memory-file={memory_file}'
vars:
- 'num_servers=0'
- 'memsize=0'
@@ -1196,6 +1201,7 @@
- 'num_slow_threads=0'
- 'num_fast_threads=0'
- 'num_client_threads=0'
- 'memory_file='
hooks:
- hook: copymove
options:
@@ -1233,6 +1239,7 @@
- '--test-timeout-buffer={test_timeout_buffer}'
- '--postprocessing-timeout-buffer={postprocessing_timeout_buffer}'
- '--poll-interval={poll_interval}'
- '--memory-file={memory_file}'
vars:
- 'num_servers=0'
- 'memsize=0.5'
@@ -1254,6 +1261,7 @@
- 'test_timeout_buffer=0'
- 'postprocessing_timeout_buffer=60'
- 'poll_interval=0.2'
- 'memory_file='
hooks:
- hook: copymove
options:
@@ -1290,6 +1298,7 @@
- '--conns-per-server-core={conns_per_server_core}'
- '--stats-interval={stats_interval}'
- '--disable-tls={disable_tls}'
- '--memory-file={memory_file}'
- '--real'
vars:
- 'num_servers=0'
@@ -1308,6 +1317,7 @@
- 'conns_per_server_core=85'
- 'stats_interval=5000'
- 'disable_tls=0'
- 'memory_file='
hooks:
- hook: tao_instruction
- hook: copymove
90 changes: 90 additions & 0 deletions packages/tao_bench/README.md
@@ -313,6 +313,96 @@ the guide for the `tao_bench_autoscale` job. Just replace the job name.
A good run of this workload should have about 75~80% of CPU utilization in the steady
state, of which 25~30% should be in userspace.

## Memory file for fast warm restart

TaoBench supports a memory file option that allows the server to persist its
cache state across runs. On the first run, TaoBench fills the cache from scratch
(cold start). On subsequent runs with the same memory file, the server reloads
the pre-filled cache instantly, skipping the lengthy warmup phase.

This uses memcached's native `-e` flag, which mmaps a file for slab storage.
On graceful shutdown (SIGUSR1), memcached saves a `.meta` restart file. On the
next startup with the same memory file, memcached restores all cached items
from the file.
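The launch-and-shutdown flow can be sketched in Python as follows. This is a
minimal illustration, not TaoBench's actual launcher; the helper names, the
bare `memcached` path, and the `-m` flag usage here are assumptions:

```python
import signal
import subprocess

def build_server_cmd(memory_file=None, memsize_mb=1024):
    # Minimal memcached invocation; -m sets the cache size in MB.
    cmd = ["memcached", "-m", str(memsize_mb)]
    if memory_file:
        # -e mmaps slab storage onto this file so the cache contents
        # can be restored on the next startup.
        cmd += ["-e", memory_file]
    return cmd

def stop_gracefully(proc, timeout=15):
    # SIGUSR1 asks memcached to write the .meta restart file before
    # exiting; SIGTERM or SIGKILL would skip that step and lose the cache.
    proc.send_signal(signal.SIGUSR1)
    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()
```

The key point is that only the graceful signal path preserves the restart
metadata; a hard kill leaves the memory file without its `.meta` companion.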

### Usage

Specify the `memory_file` parameter pointing to a path on a tmpfs filesystem
(e.g. `/dev/shm`):

```bash
./benchpress_cli.py run tao_bench_autoscale -i '{"memory_file": "/dev/shm/tao_bench_mem"}'
```

On the first run, TaoBench will create per-instance memory files at
`/dev/shm/tao_bench_mem.0`, `/dev/shm/tao_bench_mem.1`, etc. When the run
completes, the server is shut down with SIGUSR1, which saves the restart
metadata (a small `.meta` file alongside each memory file).

On subsequent runs with the same `memory_file` path, the server loads all
cached items from the memory files, drastically reducing warmup time.
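The per-instance naming convention above can be sketched with a hypothetical
helper (shown only to illustrate the file layout, not part of TaoBench):

```python
def per_instance_paths(prefix, num_instances):
    # Instance i uses "<prefix>.<i>"; on graceful shutdown memcached
    # writes a companion "<prefix>.<i>.meta" restart file next to it.
    return [f"{prefix}.{i}" for i in range(num_instances)]
```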

### /dev/shm sizing

The memory files are stored in `/dev/shm` (tmpfs), which defaults to 50% of
system RAM. When using large `memsize` values, `/dev/shm` may be too small.
TaoBench automatically expands `/dev/shm` when needed (requires root), but
you can also do it manually:

```bash
mount -o remount,size=800G /dev/shm
```
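The capacity check behind the automatic expansion can be sketched like this;
the helper names are illustrative and TaoBench's actual logic may differ:

```python
import os

def shm_total_bytes(path="/dev/shm"):
    # statvfs reports the filesystem's fragment size and total block count.
    st = os.statvfs(path)
    return st.f_frsize * st.f_blocks

def needs_remount(required_bytes, path="/dev/shm"):
    # True when the tmpfs is smaller than the memory files we plan to create.
    return shm_total_bytes(path) < required_bytes

def remount_cmd(size_gb):
    # Equivalent of the manual command above; requires root.
    return ["mount", "-o", f"remount,size={size_gb}G", "/dev/shm"]
```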

### Parameters

- `memory_file`: Path prefix for memory files. Each server instance gets a
suffixed file (e.g. `/dev/shm/tao_bench_mem.0`). Default is empty (disabled).

## Auto-warmup detection

When enabled, TaoBench monitors server statistics during the warmup phase and
automatically signals clients to stop warmup early once the server reaches a
warmed-up state. The `warmup_time` parameter becomes the maximum warmup
duration rather than a fixed duration.

### How it works

The server side opens a TCP control port (`port_number_start + 1000`, default
12211) and monitors the server log output in real time. Warmup is considered
complete when:

1. The hit ratio reaches at least 95% of `target_hit_ratio` (default 0.9, giving a threshold of 0.855)
2. QPS stabilizes (coefficient of variation < 5%) over a 2-minute rolling window

When both conditions are met for all server instances, the control port responds
`READY` to client polls. Clients poll the control port every 5 seconds during
warmup, and terminate the warmup phase early when the server reports `READY`.
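The two conditions can be expressed as a readiness predicate. The sketch below
is an illustrative reimplementation, not the code TaoBench runs:

```python
from statistics import mean, stdev

def warmed_up(hit_ratio, qps_window, target_hit_ratio=0.9, cv_threshold=0.05):
    # Condition 1: hit ratio within 95% of the target
    # (with the default 0.9 target, the threshold is 0.855).
    if hit_ratio < 0.95 * target_hit_ratio:
        return False
    # Condition 2: QPS coefficient of variation under the threshold
    # across the rolling window of recent samples.
    if len(qps_window) < 2:
        return False
    avg = mean(qps_window)
    if avg == 0:
        return False
    return stdev(qps_window) / avg < cv_threshold
```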

### Usage

Set `auto_warmup` to 1:

```bash
./benchpress_cli.py run tao_bench_autoscale -i '{"auto_warmup": 1}'
```

This is most useful combined with the memory file option:

```bash
# Second run with pre-warmed cache + auto-warmup detection
./benchpress_cli.py run tao_bench_autoscale -i '{"memory_file": "/dev/shm/tao_bench_mem", "auto_warmup": 1}'
```

The generated client instructions will automatically include the `control_port`
parameter so clients can poll the server's warmup status.
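A single client-side poll can be sketched as a plain TCP read; the protocol
details beyond the `READY` reply are assumptions:

```python
import socket

def poll_warmup_status(host, port, timeout=5.0):
    # One poll of the control port: connect, read the reply, and report
    # whether the server says warmup is complete.
    try:
        with socket.create_connection((host, port), timeout=timeout) as sock:
            reply = sock.recv(64).decode(errors="replace").strip()
            return reply == "READY"
    except OSError:
        # Control port not open yet (or auto-warmup disabled): not ready.
        return False
```

A real client would call this every few seconds during warmup and break out of
the warmup loop once it returns `True`.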

### Parameters

- `auto_warmup`: Set to 1 to enable auto-warmup detection. Default is 0 (disabled)
  for `tao_bench_autoscale` and 1 (enabled) for `tao_bench_autoscale_v2_beta`.
- `target_hit_ratio`: Target hit ratio for warmup completion detection. Default
is 0.9. Warmup is considered complete when hit ratio reaches 95% of this value.

## Advanced job: `tao_bench_custom`

**NOTE**: We recommend using `tao_bench_autoscale` to start the server as this job
10 changes: 10 additions & 0 deletions packages/tao_bench/args_utils.py
@@ -166,6 +166,16 @@ def add_common_server_args(server_parser: ArgumentParser) -> List[Tuple[str, str
help="poll interval in seconds for process completion detection; "
+ "if > 0, use polling mechanism instead of fixed timeout",
)
server_parser.add_argument(
"--memory-file",
type=str,
nargs="?",
const="",
default="",
help="path to memory file for memcached -e flag. "
+ "Enables persistent memory backing via mmap. On graceful shutdown (SIGUSR1), "
+ "memcached saves state to this file for fast warm restart on subsequent runs.",
)
server_parser.add_argument("--real", action="store_true", help="for real")

return get_opt_strings(server_parser)
54 changes: 41 additions & 13 deletions packages/tao_bench/run.py
@@ -9,6 +9,7 @@
import pathlib
import re
import shlex
import signal
import subprocess
import sys
import threading
@@ -62,6 +63,7 @@ def run_cmd(
cmd: List[str],
timeout=None,
for_real=True,
graceful_signal=None,
) -> str:
print(" ".join(cmd))
if for_real:
@@ -72,18 +74,40 @@
try:
proc.wait(timeout=timeout)
except subprocess.TimeoutExpired:
print(f"Process timeout expired, terminating process {proc.pid}...")
proc.terminate()
try:
# Give the process 5 seconds to terminate gracefully
proc.wait(timeout=5)
print(f"Process {proc.pid} terminated gracefully")
except subprocess.TimeoutExpired:
# If it still doesn't terminate, force kill it
print(f"Process {proc.pid} didn't respond to SIGTERM, force killing...")
proc.kill()
proc.wait()
print(f"Process {proc.pid} killed successfully")
if graceful_signal is not None:
print(
f"Process timeout expired, sending graceful signal "
f"{graceful_signal} to process {proc.pid}..."
)
os.kill(proc.pid, graceful_signal)
try:
proc.wait(timeout=15)
print(f"Process {proc.pid} exited after graceful signal")
return
except subprocess.TimeoutExpired:
print(
f"Process {proc.pid} didn't exit after graceful signal, "
f"force killing..."
)
proc.kill()
proc.wait()
print(f"Process {proc.pid} killed successfully")
else:
print(f"Process timeout expired, terminating process {proc.pid}...")
proc.terminate()
try:
# Give the process 5 seconds to terminate gracefully
proc.wait(timeout=5)
print(f"Process {proc.pid} terminated gracefully")
except subprocess.TimeoutExpired:
# If it still doesn't terminate, force kill it
print(
f"Process {proc.pid} didn't respond to SIGTERM, "
f"force killing..."
)
proc.kill()
proc.wait()
print(f"Process {proc.pid} killed successfully")


def profile_server():
@@ -221,6 +245,9 @@ def run_server(args):
"-o",
",".join(extended_options),
]
if args.memory_file:
server_cmd += ["-e", args.memory_file]

if "DCPERF_PERF_RECORD" in os.environ and os.environ["DCPERF_PERF_RECORD"] == "1":
profiler_wait_time = (
args.warmup_time + args.timeout_buffer + SERVER_PROFILING_DELAY
@@ -236,7 +263,8 @@
os.environ["LD_LIBRARY_PATH"] = os.path.join(TAO_BENCH_DIR, "build-deps/lib")

timeout = args.warmup_time + args.test_time + args.timeout_buffer
run_cmd(server_cmd, timeout, args.real)
graceful_sig = signal.SIGUSR1 if args.memory_file else None
run_cmd(server_cmd, timeout, args.real, graceful_signal=graceful_sig)

if "DCPERF_PERF_RECORD" in os.environ and os.environ["DCPERF_PERF_RECORD"] == "1":
t_prof.cancel()