Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,30 @@ jobs:
sleep 2
printf '*1\r\n$4\r\nPING\r\n' | nc 127.0.0.1 6391 | head -n 1 | grep PONG
docker rm -f pomai-cache-ci


soak-short:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
- run: cmake --build build -j
- run: ./build/pomai_cache_server --port 6390 &
- run: sleep 2
- run: python3 tests/soak/pomai_cache_soak.py --port 6390 --duration 120
- run: pkill pomai_cache_server || true

replay-smoke:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
- run: cmake --build build -j
- run: ./build/pomai_cache_server --port 6392 &
- run: sleep 2
- run: ./build/pomai_cache_replay --trace traces/mini_hotset.trace --port 6392 --json out/replay_ci.json --csv out/replay_ci.csv
- uses: actions/upload-artifact@v4
with:
name: replay-artifacts
path: out/replay_ci.*
- run: pkill pomai_cache_server || true
22 changes: 22 additions & 0 deletions .github/workflows/perf-nightly.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: perf-nightly
on:
workflow_dispatch:
schedule:
- cron: "0 3 * * *"

jobs:
replay-perf:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
- run: cmake --build build -j
- run: ./build/pomai_cache_server --port 6393 &
- run: sleep 2
- run: ./build/pomai_cache_replay --trace traces/mini_hotset.trace --port 6393 --json out/perf_mini.json --csv out/perf_mini.csv
- run: ./build/pomai_cache_replay --trace traces/ttlheavy.trace --port 6393 --json out/perf_ttl.json --csv out/perf_ttl.csv
- uses: actions/upload-artifact@v4
with:
name: perf-nightly
path: out/perf_*
- run: pkill pomai_cache_server || true
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ if(NOT WIN32)

add_executable(pomai_cache_netbench bench/pomai_cache_netbench.cpp)
target_link_libraries(pomai_cache_netbench PRIVATE pomai_cache_core)

add_executable(pomai_cache_replay bench/pomai_cache_replay.cpp)
target_link_libraries(pomai_cache_replay PRIVATE pomai_cache_core)
endif()

add_executable(pomai_cache_bench bench/pomai_cache_bench.cpp)
Expand All @@ -41,6 +44,14 @@ if(BUILD_TESTING)
add_test(NAME test_engine COMMAND test_engine)
add_test(NAME test_resp COMMAND test_resp)

add_executable(test_chaos tests/test_chaos.cpp)
target_link_libraries(test_chaos PRIVATE pomai_cache_core mini_catch_main)
add_test(NAME test_chaos COMMAND test_chaos)

add_executable(test_canary tests/test_canary.cpp)
target_link_libraries(test_canary PRIVATE pomai_cache_core mini_catch_main)
add_test(NAME test_canary COMMAND test_canary)

if(NOT WIN32)
add_executable(test_integration tests/test_integration.cpp)
target_link_libraries(test_integration PRIVATE pomai_cache_core mini_catch_main)
Expand Down
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,26 @@ Bench reports per workload and policy:
- max concurrent connections enforced
- slow-client protection via bounded output buffer
- bounded per-tick TTL cleanup


## Trace replay example

```bash
./build/pomai_cache_replay --trace traces/mini_hotset.trace --port 6379 --scale 2.0 --json out/replay_summary.json --csv out/replay_timeseries.csv
```

## Canary rollout example

```bash
redis-cli -p 6379 CONFIG SET POLICY.CANARY_PCT 10
redis-cli -p 6379 CONFIG SET PARAMS config/policy_params.json
redis-cli -p 6379 INFO
```

## Slowlog and diagnostics

```bash
redis-cli -p 6379 SLOWLOG GET 10
redis-cli -p 6379 DEBUG DUMPSTATS /tmp/pomai_dump.txt
redis-cli -p 6379 TRACE STREAM
```
160 changes: 160 additions & 0 deletions bench/pomai_cache_replay.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#include "pomai_cache/resp.hpp"

#include <algorithm>
#include <arpa/inet.h>
#include <cerrno>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <exception>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <netinet/in.h>
#include <numeric>
#include <regex>
#include <sstream>
#include <string>
#include <sys/socket.h>
#include <system_error>
#include <thread>
#include <unistd.h>
#include <vector>

namespace {
/// One operation parsed from a trace line.
struct TraceOp {
  /// Timestamp in milliseconds as read from the "ts_ms" field; used to pace the replay.
  std::uint64_t ts_ms = 0;
  /// Operation name from the "op" field ("GET", "DEL"; anything else is replayed as SET).
  std::string op;
  /// Hashed key from the "key_hash" field; reduced modulo 1000 when the command is built.
  std::size_t key_hash = 0;
  /// Payload size for SET from the "value_size" field; 0 selects a 16-byte payload.
  std::size_t value_size = 0;
};

/// Extracts the unsigned integer stored under `"key": <digits>` in a JSON-ish line.
///
/// @param line  One trace line.
/// @param key   Field name to look for (inserted verbatim into the regex, so it
///              must not contain regex metacharacters).
/// @param out   Receives the parsed value; left untouched when the function
///              returns false.
/// @return true when the field exists and fits into 64 bits, false otherwise.
bool extract_u64(const std::string &line, const std::string &key, std::uint64_t &out) {
  const std::regex re("\\\"" + key + "\\\"\\s*:\\s*([0-9]+)");
  std::smatch m;
  if (!std::regex_search(line, m, re)) return false;

  // Parse by hand with an overflow check: std::stoull would throw
  // std::out_of_range on an over-long digit run and take the whole tool down
  // because of a single malformed trace line.
  const std::uint64_t max_value = ~std::uint64_t{0};
  std::uint64_t value = 0;
  for (auto it = m[1].first; it != m[1].second; ++it) {
    const std::uint64_t digit = static_cast<std::uint64_t>(*it - '0');
    if (value > (max_value - digit) / 10) return false;
    value = value * 10 + digit;
  }
  out = value;
  return true;
}

/// Extracts the string stored under `"key": "<text>"` in a JSON-ish line.
///
/// The captured text is everything up to the next double quote; escaped quotes
/// inside the value are not understood. `out` is only written on success.
///
/// @return true when the field was found, false otherwise.
bool extract_str(const std::string &line, const std::string &key, std::string &out) {
  const std::string pattern = "\\\"" + key + "\\\"\\s*:\\s*\\\"([^\\\"]*)\\\"";
  const std::regex field_re(pattern);
  std::smatch found;
  const bool matched = std::regex_search(line, found, field_re);
  if (matched) {
    out = found.str(1);
  }
  return matched;
}

/// Builds the RESP-encoded command for one trace operation.
///
/// The key is "k" followed by `key_hash % 1000`. "GET" and "DEL" become
/// two-element commands; every other op name is sent as SET with a payload of
/// `value_size` 'x' characters (16 when `value_size` is 0).
std::string mkcmd(const TraceOp &op) {
  const std::string key = "k" + std::to_string(op.key_hash % 1000);
  const std::string key_bulk = "$" + std::to_string(key.size()) + "\r\n" + key + "\r\n";

  if (op.op == "GET") {
    return "*2\r\n$3\r\nGET\r\n" + key_bulk;
  }
  if (op.op == "DEL") {
    return "*2\r\n$3\r\nDEL\r\n" + key_bulk;
  }

  std::size_t payload_len = 16;
  if (op.value_size > 0) {
    payload_len = op.value_size;
  }
  const std::string payload(payload_len, 'x');
  const std::string payload_bulk = "$" + std::to_string(payload.size()) + "\r\n" + payload + "\r\n";
  return "*3\r\n$3\r\nSET\r\n" + key_bulk + payload_bulk;
}

/// Returns the p-quantile of `v` formatted with default stream formatting.
///
/// The sample picked is the one at index floor((n - 1) * p) of the sorted
/// copy, so no interpolation takes place. An empty input yields "0".
///
/// @param v  Samples in any order (not modified).
/// @param p  Quantile, expected in [0, 1].
std::string percentile(const std::vector<double> &v, double p) {
  if (v.empty()) {
    return "0";
  }

  std::vector<double> ordered(v.begin(), v.end());
  std::sort(ordered.begin(), ordered.end());

  const double position = std::floor((ordered.size() - 1) * p);
  const auto rank = static_cast<std::size_t>(position);

  std::ostringstream text;
  text << ordered[rank];
  return text.str();
}

std::string send_cmd(int fd, const std::string &cmd) {
send(fd, cmd.data(), cmd.size(), 0);
char buf[4096];
ssize_t n = recv(fd, buf, sizeof(buf), 0);
if (n <= 0) return {};
return std::string(buf, static_cast<std::size_t>(n));
}
} // namespace

/// Replays a recorded trace against a cache server on 127.0.0.1 and reports
/// throughput, latency percentiles and GET hit rate.
///
/// Options: --trace <file>, --port <n>, --scale <factor> (time compression;
/// values <= 0 disable pacing), --json <file>, --csv <file>.
///
/// Exit codes: 0 success; 1 bad arguments or unreadable trace; 2 socket,
/// connect or mid-replay connection failure; 3 an output file could not be
/// written.
int main(int argc, char **argv) {
  std::string trace_path = "traces/mini_hotset.trace";
  std::string out_json = "out/replay_summary.json";
  std::string out_csv = "out/replay_timeseries.csv";
  int port = 6379;
  double scale = 1.0;
  try {
    for (int i = 1; i < argc; ++i) {
      const std::string a = argv[i];
      if (a == "--trace" && i + 1 < argc) trace_path = argv[++i];
      else if (a == "--port" && i + 1 < argc) port = std::stoi(argv[++i]);
      else if (a == "--scale" && i + 1 < argc) scale = std::stod(argv[++i]);
      else if (a == "--json" && i + 1 < argc) out_json = argv[++i];
      else if (a == "--csv" && i + 1 < argc) out_csv = argv[++i];
    }
  } catch (const std::exception &) {
    // std::stoi / std::stod throw on non-numeric or out-of-range input.
    std::cerr << "invalid numeric value for --port or --scale\n";
    return 1;
  }
  if (port < 1 || port > 65535) {
    std::cerr << "port out of range\n";
    return 1;
  }

  std::ifstream in(trace_path);
  if (!in.is_open()) {
    std::cerr << "trace file not found\n";
    return 1;
  }
  std::vector<TraceOp> ops;
  for (std::string line; std::getline(in, line);) {
    TraceOp op;
    if (!extract_str(line, "op", op.op) || op.op.empty()) continue;
    extract_u64(line, "ts_ms", op.ts_ms);
    // Separate temporaries: with a shared one, a line without "value_size"
    // would silently reuse the key hash as the payload size.
    std::uint64_t key_hash = 0;
    std::uint64_t value_size = 0;
    extract_u64(line, "key_hash", key_hash);
    extract_u64(line, "value_size", value_size);
    op.key_hash = static_cast<std::size_t>(key_hash);
    op.value_size = static_cast<std::size_t>(value_size);
    ops.push_back(op);
  }

  const int fd = socket(AF_INET, SOCK_STREAM, 0);
  if (fd < 0) {
    std::cerr << "socket failed\n";
    return 2;
  }
  sockaddr_in addr{};
  addr.sin_family = AF_INET;
  addr.sin_port = htons(static_cast<std::uint16_t>(port));
  inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
  if (connect(fd, reinterpret_cast<sockaddr *>(&addr), sizeof(addr)) < 0) {
    std::cerr << "connect failed\n";
    close(fd);
    return 2;
  }

  const auto before = send_cmd(fd, "*1\r\n$4\r\nINFO\r\n");
  std::vector<double> lats;
  lats.reserve(ops.size());
  std::vector<std::string> ts_rows;
  std::uint64_t hits = 0;
  std::uint64_t gets = 0;
  const auto replay_start = std::chrono::steady_clock::now();
  const std::uint64_t base_ts = ops.empty() ? 0 : ops.front().ts_ms;

  for (std::size_t i = 0; i < ops.size(); ++i) {
    // Pace the replay. Timestamps at or before the first one need no wait;
    // skipping them also avoids unsigned wrap-around on out-of-order traces.
    if (i > 0 && scale > 0.0 && ops[i].ts_ms > base_ts) {
      const double target_ms = static_cast<double>(ops[i].ts_ms - base_ts) / scale;
      const double elapsed_ms =
          std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - replay_start).count();
      if (target_ms > elapsed_ms) {
        std::this_thread::sleep_for(std::chrono::duration<double, std::milli>(target_ms - elapsed_ms));
      }
    }
    const auto st = std::chrono::steady_clock::now();
    const auto resp = send_cmd(fd, mkcmd(ops[i]));
    const auto en = std::chrono::steady_clock::now();
    if (resp.empty()) {
      // No bytes came back: the connection is gone, further numbers would be noise.
      std::cerr << "connection lost at op " << i << "\n";
      close(fd);
      return 2;
    }
    const double us = std::chrono::duration<double, std::micro>(en - st).count();
    lats.push_back(us);
    if (ops[i].op == "GET") {
      ++gets;
      // A hit is a non-null bulk string; "$-1" is a miss and "-ERR ..." is an error.
      if (resp[0] == '$' && resp.compare(0, 3, "$-1") != 0) ++hits;
    }
    if (i % 50 == 0) {
      ts_rows.push_back(std::to_string(i) + "," + std::to_string(us));
    }
  }
  const auto after = send_cmd(fd, "*1\r\n$4\r\nINFO\r\n");
  close(fd);

  const double seconds = std::chrono::duration<double>(std::chrono::steady_clock::now() - replay_start).count();
  const double ops_s = seconds > 0 ? static_cast<double>(ops.size()) / seconds : 0.0;
  const double hit_rate = gets > 0 ? static_cast<double>(hits) / static_cast<double>(gets) : 0.0;

  // Each percentile() call sorts a copy of the samples, so compute them once.
  const std::string p50 = percentile(lats, 0.50);
  const std::string p95 = percentile(lats, 0.95);
  const std::string p99 = percentile(lats, 0.99);
  const std::string p999 = percentile(lats, 0.999);

  // Escapes the two characters that would break a JSON string literal.
  const auto json_escape = [](const std::string &raw) {
    std::string escaped;
    escaped.reserve(raw.size());
    for (const char c : raw) {
      if (c == '"' || c == '\\') escaped.push_back('\\');
      escaped.push_back(c);
    }
    return escaped;
  };

  // Opens `path` for writing, creating missing parent directories first so the
  // default "out/..." locations work in a fresh checkout.
  const auto open_output = [](const std::string &path, std::ofstream &out) {
    const auto parent = std::filesystem::path(path).parent_path();
    if (!parent.empty()) {
      std::error_code ec;
      std::filesystem::create_directories(parent, ec);  // failure surfaces via open()
    }
    out.open(path);
    return out.is_open();
  };

  bool outputs_ok = true;

  std::ofstream jout;
  if (open_output(out_json, jout)) {
    jout << "{\n";
    jout << "  \"trace\": \"" << json_escape(trace_path) << "\",\n";
    jout << "  \"ops\": " << ops.size() << ",\n";
    jout << "  \"ops_per_sec\": " << ops_s << ",\n";
    jout << "  \"p50_us\": " << p50 << ",\n";
    jout << "  \"p95_us\": " << p95 << ",\n";
    jout << "  \"p99_us\": " << p99 << ",\n";
    jout << "  \"p999_us\": " << p999 << ",\n";
    jout << "  \"hit_rate\": " << hit_rate << "\n";
    jout << "}\n";
  } else {
    std::cerr << "cannot write " << out_json << "\n";
    outputs_ok = false;
  }

  std::ofstream csv;
  if (open_output(out_csv, csv)) {
    csv << "op_index,latency_us\n";
    for (const auto &r : ts_rows) csv << r << "\n";
  } else {
    std::cerr << "cannot write " << out_csv << "\n";
    outputs_ok = false;
  }

  std::cout << "ops/s=" << ops_s << " p50=" << p50 << " p95=" << p95
            << " p99=" << p99 << " p999=" << p999 << " hit_rate=" << hit_rate << "\n";
  std::cout << "INFO_BEFORE\n" << before << "\nINFO_AFTER\n" << after << "\n";
  return outputs_ok ? 0 : 3;
}
7 changes: 7 additions & 0 deletions docs/CANARY_ROLLOUT.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Canary rollout
1. Load existing params as control (LKG persists automatically).
2. Enable canary split: `CONFIG SET POLICY.CANARY_PCT 10`.
3. Reload params file with candidate values via `CONFIG SET PARAMS <path>`.

Server tracks control vs candidate hit-rate and p99 latency and auto-rolls back to LKG if guardrails are violated.
INFO includes canary fields and last rollback event.
8 changes: 8 additions & 0 deletions docs/PERF_TOOLING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Perf tooling
- `scripts/perf/perf_record.sh <cmd>`
- `scripts/perf/flamegraph.sh`
- `scripts/perf/tsan_build.sh`
- `scripts/perf/asan_build.sh`
- `scripts/perf/heap_profile.md`

Nightly perf workflow replays reference traces and uploads summary artifacts.
5 changes: 5 additions & 0 deletions docs/SOAK_CHAOS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Soak + chaos
Run soak:
`python3 tests/soak/pomai_cache_soak.py --port 6379 --duration 180`

Chaos coverage lives in `tests/test_chaos.cpp`, which churns the cache with mixed set/get/del/expire operations and ensures there is no memory overflow.
10 changes: 10 additions & 0 deletions docs/TRACING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Tracing
Tracing is off by default.

Enable:
- `CONFIG SET TRACE.PATH /tmp/pomai.trace.jsonl`
- `CONFIG SET TRACE.SAMPLE_RATE 0.1`
- `CONFIG SET TRACE.ENABLED yes`

Traces are written as JSONL: each line records the hashed key, op type, value size, TTL class, owner, hit/miss, and latency bucket.
Use `TRACE STREAM` for capped in-memory recent trace lines.
26 changes: 26 additions & 0 deletions include/pomai_cache/engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <cstdint>
#include <functional>
#include <deque>
#include <memory>
#include <optional>
#include <queue>
Expand Down Expand Up @@ -46,6 +47,10 @@ class Engine {

std::string info() const;
bool reload_params(const std::string &path, std::string *err = nullptr);
void set_canary_pct(std::uint64_t pct);
std::uint64_t canary_pct() const { return canary_pct_; }
bool rollback_to_lkg(std::string *err = nullptr);
bool dump_stats(const std::string &path, std::string *err = nullptr) const;

const EngineStats &stats() const { return stats_; }
std::size_t memory_used() const { return memory_used_; }
Expand All @@ -70,6 +75,15 @@ class Engine {
void evict_until_fit();
double owner_miss_cost(const std::string &owner) const;
std::size_t bucket_for(std::size_t size) const;
bool is_canary_key(const std::string &key) const;
void maybe_evaluate_canary();
static std::uint64_t p99_from_samples(const std::deque<std::uint64_t> &samples);

/// Counters kept separately for each cohort (see `control_stats_` and
/// `canary_stats_`).
struct CohortStats {
  /// Number of GETs recorded for the cohort.
  std::uint64_t gets = 0;
  /// Number of those GETs that were hits.
  std::uint64_t hits = 0;
  /// Latency samples; microseconds judging by the name.
  std::deque<std::uint64_t> latency_us;
};

EngineConfig cfg_;
std::unique_ptr<IEvictionPolicy> policy_;
Expand All @@ -84,6 +98,18 @@ class Engine {
std::size_t memory_used_{0};
std::size_t bucket_used_{0};
std::size_t expiration_backlog_{0};
std::uint64_t canary_pct_{0};
bool canary_active_{false};
PolicyParams control_params_{};
PolicyParams canary_params_{};
std::string lkg_path_{".pomai_lkg_params.json"};
CohortStats control_stats_;
CohortStats canary_stats_;
TimePoint canary_start_{Clock::now()};
TimePoint last_guardrail_eval_{Clock::now()};
std::uint64_t baseline_evictions_{0};
std::uint64_t rollback_events_{0};
std::string last_canary_event_{"none"};
};

std::unique_ptr<IEvictionPolicy> make_policy_by_name(const std::string &mode);
Expand Down
4 changes: 4 additions & 0 deletions scripts/perf/asan_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Configures and builds a Debug tree in ./build-asan with AddressSanitizer and
# UndefinedBehaviorSanitizer enabled for both compile and link steps.
# Run from the repository root (the source directory is given as ".").
set -euo pipefail
# -fno-omit-frame-pointer keeps frame pointers so sanitizer stack traces stay readable.
cmake -S . -B build-asan -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS='-fsanitize=address,undefined -fno-omit-frame-pointer' -DCMAKE_EXE_LINKER_FLAGS='-fsanitize=address,undefined'
cmake --build build-asan -j
3 changes: 3 additions & 0 deletions scripts/perf/flamegraph.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Renders perf samples as flamegraph.svg in the current directory.
# Requires `perf`, `stackcollapse-perf.pl` and `flamegraph.pl` on PATH.
# NOTE(review): presumably meant to run after scripts/perf/perf_record.sh, in the
# directory holding its recording — confirm.
set -euo pipefail
perf script | stackcollapse-perf.pl | flamegraph.pl > flamegraph.svg
7 changes: 7 additions & 0 deletions scripts/perf/heap_profile.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Heap profiling
Use jemalloc or mimalloc profiling if available.

Example (jemalloc):
```
MALLOC_CONF=prof:true,lg_prof_sample:19,prof_prefix:jeprof ./build/pomai_cache_server
```
3 changes: 3 additions & 0 deletions scripts/perf/perf_record.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Runs the given command under `perf record`.
# Usage: perf_record.sh <cmd> [args...]
set -euo pipefail
# -F 99: sample at 99 Hz; -g: record call graphs; "--" ends perf's own options
# so the command's arguments are passed through untouched.
perf record -F 99 -g -- "$@"
4 changes: 4 additions & 0 deletions scripts/perf/tsan_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Configures and builds a Debug tree in ./build-tsan with ThreadSanitizer
# enabled for both compile and link steps.
# Run from the repository root (the source directory is given as ".").
set -euo pipefail
# -fno-omit-frame-pointer keeps frame pointers so sanitizer stack traces stay readable.
cmake -S . -B build-tsan -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS='-fsanitize=thread -fno-omit-frame-pointer' -DCMAKE_EXE_LINKER_FLAGS='-fsanitize=thread'
cmake --build build-tsan -j
Loading
Loading