diff --git a/packages/ucache_bench/run.py b/packages/ucache_bench/run.py
index d0624163..a283bf5f 100644
--- a/packages/ucache_bench/run.py
+++ b/packages/ucache_bench/run.py
@@ -235,6 +235,10 @@ def run_server(args: argparse.Namespace) -> None:
         server_cmd.append(
             f"--rpc_num_cpu_worker_threads={args.rpc_num_cpu_worker_threads}"
         )
+    if args.rpc_socket_max_reads_per_event != 1:
+        server_cmd.append(
+            f"--rpc_socket_max_reads_per_event={args.rpc_socket_max_reads_per_event}"
+        )
 
     # CPU pinning configuration
     if args.cpu_pinning_enabled:
@@ -461,6 +465,12 @@ def init_parser() -> argparse.ArgumentParser:
         default=1,
         help="Number of CPU worker threads for ThriftServer",
     )
+    server_parser.add_argument(
+        "--rpc-socket-max-reads-per-event",
+        type=int,
+        default=1,
+        help="Max reads per socket per event loop iteration (production uses 1, ThriftServer default is 16)",
+    )
 
     # CPU pinning configuration
     server_parser.add_argument(
diff --git a/packages/ucache_bench/server/UcacheBenchRpcServer.cpp b/packages/ucache_bench/server/UcacheBenchRpcServer.cpp
index 941164d6..559964fe 100644
--- a/packages/ucache_bench/server/UcacheBenchRpcServer.cpp
+++ b/packages/ucache_bench/server/UcacheBenchRpcServer.cpp
@@ -27,6 +27,12 @@ DEFINE_uint32(
     1,
     "Number of CPU worker threads for ThriftServer. "
     "Production ucache uses 1. These handle CPU-bound work separate from IO");
+DEFINE_uint32(
+    rpc_socket_max_reads_per_event,
+    1,
+    "Max reads per socket per event loop iteration. "
+    "Production ucache uses 1. ThriftServer default is 16. "
+    "Higher values let a single connection deliver more requests per epoll wakeup");
 
 // CPU pinning configuration flags
 DEFINE_bool(
@@ -163,7 +169,8 @@ apache::thrift::ThriftServer& UcacheBenchRpcServer::addThriftServer() {
   // Prevent single connection from monopolizing an IO thread's event loop.
   // Without this, a few hot connections can starve others, limiting
   // multi-client scalability.
-  thriftServer_->setSocketMaxReadsPerEvent(1);
+  thriftServer_->setSocketMaxReadsPerEvent(
+      FLAGS_rpc_socket_max_reads_per_event);
 
   // Disable timeouts — let clients control timing, same as production ucache.
   thriftServer_->setQueueTimeout(std::chrono::milliseconds(0));
@@ -176,8 +183,10 @@ apache::thrift::ThriftServer& UcacheBenchRpcServer::addThriftServer() {
   thriftServer_->disableActiveRequestsTracking();
 
   XLOG(INFO) << "ThriftServer configured with "
-             << FLAGS_rpc_num_cpu_worker_threads << " CPU worker threads and "
-             << numAcceptorThreads << " acceptor threads";
+             << FLAGS_rpc_num_cpu_worker_threads << " CPU worker threads, "
+             << numAcceptorThreads << " acceptor threads, "
+             << "socketMaxReadsPerEvent="
+             << FLAGS_rpc_socket_max_reads_per_event;
 
   return *thriftServer_;
 }
diff --git a/packages/ucache_bench/server/UcacheBenchServer.cpp b/packages/ucache_bench/server/UcacheBenchServer.cpp
index 65fb2cb0..6ea0f9c7 100644
--- a/packages/ucache_bench/server/UcacheBenchServer.cpp
+++ b/packages/ucache_bench/server/UcacheBenchServer.cpp
@@ -45,6 +45,11 @@ void UcacheBenchServer::setupCacheLib() {
   cacheConfig.setAccessConfig(
       {config_.hash_power, config_.hashtable_lock_power});
 
+  // Configure number of CacheLib shards if specified
+  if (config_.cachelib_num_shards > 0) {
+    cacheConfig.setNumShards(config_.cachelib_num_shards);
+  }
+
   // Generate alloc sizes (factor 1.25, min allocation size)
   // This provides a good distribution of allocation classes for cache items
   // Max alloc size increased to 64KB to support production traffic distribution