From 7be7909c07f885b80ddd5249eb40e3ecb32c5b07 Mon Sep 17 00:00:00 2001 From: Alik Kurdyukov Date: Mon, 23 Feb 2026 16:59:19 +0400 Subject: [PATCH 1/2] cache disable implemented --- CLAUDE.md | 3 + .../checklists/requirements.md | 35 ++++ specs/003-channel-cache-config/data-model.md | 43 +++++ specs/003-channel-cache-config/plan.md | 99 ++++++++++ specs/003-channel-cache-config/quickstart.md | 66 +++++++ specs/003-channel-cache-config/research.md | 44 +++++ specs/003-channel-cache-config/spec.md | 78 ++++++++ specs/003-channel-cache-config/tasks.md | 173 ++++++++++++++++++ src/checker.rs | 48 ++--- src/config.rs | 40 ++++ src/main.rs | 1 + tests/cache_config_test.rs | 94 ++++++++++ tests/common/mod.rs | 21 +++ 13 files changed, 723 insertions(+), 22 deletions(-) create mode 100644 specs/003-channel-cache-config/checklists/requirements.md create mode 100644 specs/003-channel-cache-config/data-model.md create mode 100644 specs/003-channel-cache-config/plan.md create mode 100644 specs/003-channel-cache-config/quickstart.md create mode 100644 specs/003-channel-cache-config/research.md create mode 100644 specs/003-channel-cache-config/spec.md create mode 100644 specs/003-channel-cache-config/tasks.md create mode 100644 tests/cache_config_test.rs diff --git a/CLAUDE.md b/CLAUDE.md index 99214e6..7349941 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,6 +4,8 @@ Auto-generated from all feature plans. 
Last updated: 2025-10-28 ## Active Technologies - Rust 1.75+ (edition 2024) + testcontainers 0.27 (new), tokio 1.x, tonic 0.14, existing mock-grpc-backend Docker image (002-testcontainers-integration) +- Rust 1.75+ (edition 2024) + tokio 1.x, tonic 0.14.2, dashmap 6.1.0, clap 4.5, serde 1.0, toml 0.9.8, prometheus 0.14 (003-channel-cache-config) +- N/A (in-memory DashMap cache, no persistence) (003-channel-cache-config) - Rust 1.75+ + tokio (async runtime), tonic (gRPC client for backend checks), prometheus (metrics), serde (config/logging) (001-core-agent) @@ -23,6 +25,7 @@ cargo test [ONLY COMMANDS FOR ACTIVE TECHNOLOGIES][ONLY COMMANDS FOR ACTIVE TECH Rust 1.75+: Follow standard conventions ## Recent Changes +- 003-channel-cache-config: Added Rust 1.75+ (edition 2024) + tokio 1.x, tonic 0.14.2, dashmap 6.1.0, clap 4.5, serde 1.0, toml 0.9.8, prometheus 0.14 - 002-testcontainers-integration: Added Rust 1.75+ (edition 2024) + testcontainers 0.27 (new), tokio 1.x, tonic 0.14, existing mock-grpc-backend Docker image - 001-core-agent: Added Rust 1.75+ + tokio (async runtime), tonic (gRPC client for backend checks), prometheus (metrics), serde (config/logging) diff --git a/specs/003-channel-cache-config/checklists/requirements.md b/specs/003-channel-cache-config/checklists/requirements.md new file mode 100644 index 0000000..c7f8ea7 --- /dev/null +++ b/specs/003-channel-cache-config/checklists/requirements.md @@ -0,0 +1,35 @@ +# Specification Quality Checklist: Configurable Channel Cache + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2026-02-23 +**Feature**: [spec.md](../spec.md) + +## Content Quality + +- [x] No implementation details (languages, frameworks, APIs) +- [x] Focused on user value and business needs +- [x] Written for non-technical stakeholders +- [x] All mandatory sections completed + +## Requirement Completeness + +- [x] No [NEEDS CLARIFICATION] markers remain +- [x] Requirements are testable and 
unambiguous +- [x] Success criteria are measurable +- [x] Success criteria are technology-agnostic (no implementation details) +- [x] All acceptance scenarios are defined +- [x] Edge cases are identified +- [x] Scope is clearly bounded +- [x] Dependencies and assumptions identified + +## Feature Readiness + +- [x] All functional requirements have clear acceptance criteria +- [x] User scenarios cover primary flows +- [x] Feature meets measurable outcomes defined in Success Criteria +- [x] No implementation details leak into specification + +## Notes + +- All items pass validation. Spec is ready for `/speckit.clarify` or `/speckit.plan`. +- The feature is straightforward (on/off toggle for existing behavior) with clear boundaries and no ambiguity. diff --git a/specs/003-channel-cache-config/data-model.md b/specs/003-channel-cache-config/data-model.md new file mode 100644 index 0000000..8d862a7 --- /dev/null +++ b/specs/003-channel-cache-config/data-model.md @@ -0,0 +1,43 @@ +# Data Model: Configurable Channel Cache + +**Feature**: 003-channel-cache-config +**Date**: 2026-02-23 + +## Modified Entities + +### AgentConfig + +Existing configuration struct extended with one new field. 
+ +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| server_port | u16 | 5555 | TCP port for Agent Text Protocol server | +| server_bind_address | String | "0.0.0.0" | Bind address for agent server | +| grpc_connect_timeout_ms | u64 | 1000 | gRPC connection timeout in milliseconds | +| grpc_rpc_timeout_ms | u64 | 1500 | gRPC RPC timeout in milliseconds | +| **grpc_channel_cache_enabled** | **bool** | **true** | **Whether to cache and reuse gRPC channels across health checks** | +| metrics_port | u16 | 9090 | HTTP port for Prometheus metrics | +| metrics_bind_address | String | "0.0.0.0" | Bind address for metrics server | +| log_level | LogLevel | Info | Logging verbosity level | +| log_format | LogFormat | Json | Logging output format | + +**New field highlighted in bold.** + +### Configuration Sources + +| Source | Key/Flag | Example | +|--------|----------|---------| +| Environment variable | `HAPROXY_AGENT_GRPC_CHANNEL_CACHE` | `HAPROXY_AGENT_GRPC_CHANNEL_CACHE=false` | +| TOML config file | `grpc_channel_cache_enabled` | `grpc_channel_cache_enabled = false` | +| CLI argument | `--grpc-channel-cache` / `--no-grpc-channel-cache` | `--no-grpc-channel-cache` | + +### Precedence + +Environment variable < TOML config file < CLI argument (unchanged from existing behavior). 
+ +## Unchanged Entities + +- **BackendChannelKey**: No changes (server, port, ssl_flag) +- **GrpcHealthChecker**: Internal struct unchanged; behavior modified by reading `config.grpc_channel_cache_enabled` +- **HealthCheckRequest / HealthCheckResponse**: No changes +- **Metrics**: No new metrics; `GRPC_CHANNELS_ACTIVE` reports 0 when caching disabled diff --git a/specs/003-channel-cache-config/plan.md b/specs/003-channel-cache-config/plan.md new file mode 100644 index 0000000..235552b --- /dev/null +++ b/specs/003-channel-cache-config/plan.md @@ -0,0 +1,99 @@ +# Implementation Plan: Configurable Channel Cache + +**Branch**: `003-channel-cache-config` | **Date**: 2026-02-23 | **Spec**: [spec.md](spec.md) +**Input**: Feature specification from `/specs/003-channel-cache-config/spec.md` + +## Summary + +Add a boolean configuration option `grpc_channel_cache_enabled` (default: `true`) to `AgentConfig` that controls whether `GrpcHealthChecker` caches and reuses gRPC channels across health checks. When disabled, each health check creates a fresh channel and the `GRPC_CHANNELS_ACTIVE` metric remains at 0. The option follows existing configuration patterns: environment variable, TOML file, and CLI argument with standard precedence. + +## Technical Context + +**Language/Version**: Rust 1.75+ (edition 2024) +**Primary Dependencies**: tokio 1.x, tonic 0.14.2, dashmap 6.1.0, clap 4.5, serde 1.0, toml 0.9.8, prometheus 0.14 +**Storage**: N/A (in-memory DashMap cache, no persistence) +**Testing**: cargo test + testcontainers 0.27 (integration tests with mock gRPC backend) +**Target Platform**: Linux server / Docker container +**Project Type**: Single Rust binary +**Performance Goals**: No degradation when caching enabled (existing behavior). Acceptable increased latency per check when caching disabled (operator's explicit choice). +**Constraints**: Total gRPC timeout < 2000ms (HAProxy default). No new dependencies required. 
+**Scale/Scope**: Single config field addition, 3 files modified (config.rs, checker.rs, tests/common/mod.rs), 1 new test file. + +## Constitution Check + +*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.* + +| Principle | Status | Notes | +|-----------|--------|-------| +| I. Agent Pattern | PASS | Single config option, no new external dependencies, follows existing config patterns (env/file/CLI) | +| II. Integration-Heavy Testing | PASS | New integration test will verify cache-disabled behavior with real mock backend via testcontainers | +| III. Observability | PASS | `GRPC_CHANNELS_ACTIVE` metric correctly reports 0 when caching disabled; startup log includes new config value | +| IV. HAProxy Protocol Compliance | PASS | No protocol changes; agent still returns up/down via Agent Check Protocol | +| V. Simplicity & Reliability | PASS | Minimal change: one boolean field, conditional logic in one method. No abstractions, no new patterns. YAGNI-compliant. | + +No violations. No entries needed in Complexity Tracking table. 
+ +## Project Structure + +### Documentation (this feature) + +```text +specs/003-channel-cache-config/ +├── plan.md # This file +├── research.md # Phase 0 output +├── data-model.md # Phase 1 output +├── quickstart.md # Phase 1 output +└── tasks.md # Phase 2 output (created by /speckit.tasks) +``` + +### Source Code (repository root) + +```text +src/ +├── checker.rs # MODIFY: conditional cache logic in get_or_create_channel +├── config.rs # MODIFY: add grpc_channel_cache_enabled field + env/CLI/validation +├── lib.rs # NO CHANGE +├── logger.rs # NO CHANGE +├── main.rs # MODIFY: log new config value at startup +├── metrics.rs # NO CHANGE +├── protocol.rs # NO CHANGE +└── server.rs # NO CHANGE + +tests/ +├── common/mod.rs # MODIFY: add start_agent_with_config helper accepting AgentConfig +├── integration_test.rs # NO CHANGE +├── resilience_test.rs # NO CHANGE +└── cache_config_test.rs # NEW: tests for cache-enabled/disabled behavior +``` + +**Structure Decision**: Existing single-project layout preserved. Changes touch 3 source files (config.rs, checker.rs, main.rs), 1 test utility (common/mod.rs), and add 1 test file (cache_config_test.rs). + +## Complexity Tracking + +No violations to justify. + +## Design Decisions + +### D1: Config field name + +**Decision**: `grpc_channel_cache_enabled` (bool, default `true`) +**Rationale**: Follows existing naming convention (`grpc_connect_timeout_ms`, `grpc_rpc_timeout_ms`). Positive boolean (`enabled`) avoids double-negative confusion (`no_cache = false`). +**Env var**: `HAPROXY_AGENT_GRPC_CHANNEL_CACHE` (values: `true`/`false`) +**CLI flag**: `--grpc-channel-cache` / `--no-grpc-channel-cache` (clap boolean flag) +**TOML key**: `grpc_channel_cache_enabled = true` + +### D2: Implementation approach in checker.rs + +**Decision**: Conditional bypass in `get_or_create_channel` — when caching is disabled, skip cache lookup/insert and always create a fresh channel. +**Rationale**: Minimal code change. 
The channel creation logic is identical regardless of caching. Only the DashMap interactions are skipped. +**Alternative rejected**: Separate `CachedChecker` / `UncachedChecker` types — over-engineered for a boolean toggle. + +### D3: DashMap instantiation when caching disabled + +**Decision**: Still create the DashMap but never use it. The `Arc<DashMap>` is a zero-cost allocation when empty. +**Rationale**: Avoids `Option<Arc<DashMap>>` complexity. An empty DashMap has negligible memory footprint. + +### D4: Test strategy + +**Decision**: New `tests/cache_config_test.rs` with integration tests using testcontainers. Add `start_agent_with_config` to `tests/common/mod.rs` to allow passing custom `AgentConfig`. +**Rationale**: Constitution requires integration tests. Existing tests use default config and must remain unchanged. New helper enables config customization without breaking existing test patterns. diff --git a/specs/003-channel-cache-config/quickstart.md b/specs/003-channel-cache-config/quickstart.md new file mode 100644 index 0000000..add5af5 --- /dev/null +++ b/specs/003-channel-cache-config/quickstart.md @@ -0,0 +1,66 @@ +# Quickstart: Configurable Channel Cache + +**Feature**: 003-channel-cache-config + +## What Changed + +A new configuration option `grpc_channel_cache_enabled` controls whether the agent caches gRPC channels between health checks. By default, caching is enabled (existing behavior). Setting it to `false` forces the agent to create a fresh gRPC connection for every health check. 
+ +## How to Use + +### Disable channel caching via environment variable + +```bash +HAPROXY_AGENT_GRPC_CHANNEL_CACHE=false ./haproxy-grpc-agent +``` + +### Disable channel caching via TOML config file + +```toml +# agent.toml +grpc_channel_cache_enabled = false +``` + +```bash +./haproxy-grpc-agent -c agent.toml +``` + +### Disable channel caching via CLI argument + +```bash +./haproxy-grpc-agent --no-grpc-channel-cache +``` + +### Keep default behavior (caching enabled) + +No change needed. The default is `true`. All existing configurations continue to work without modification. + +## Files Modified + +| File | Change | +|------|--------| +| `src/config.rs` | New `grpc_channel_cache_enabled` field in `AgentConfig`, env var parsing, CLI arg, default function | +| `src/checker.rs` | Conditional cache bypass in `get_or_create_channel` | +| `src/main.rs` | Log new config value at startup | +| `tests/common/mod.rs` | New `start_agent_with_config` helper | +| `tests/cache_config_test.rs` | New integration tests for cache-disabled behavior | + +## Verification + +Run all tests: + +```bash +cargo test +``` + +Run only the new cache config tests: + +```bash +cargo test --test cache_config_test +``` + +Verify with clippy: + +```bash +cargo clippy +``` diff --git a/specs/003-channel-cache-config/research.md b/specs/003-channel-cache-config/research.md new file mode 100644 index 0000000..477fe23 --- /dev/null +++ b/specs/003-channel-cache-config/research.md @@ -0,0 +1,44 @@ +# Research: Configurable Channel Cache + +**Feature**: 003-channel-cache-config +**Date**: 2026-02-23 + +## R1: Boolean config field patterns in clap + serde + +**Decision**: Use `#[serde(default = "default_true")]` for TOML/serde and `#[arg(long, default_value_t = true)]` with `--no-grpc-channel-cache` negation for clap CLI. + +**Rationale**: clap 4.x supports boolean flags with `--flag` / `--no-flag` syntax via `#[arg(long, default_value_t = true, action = clap::ArgAction::Set)]`. 
For the environment variable, parse `"true"` / `"false"` strings matching the existing pattern used for other env vars in `load_from_env`. + +**Alternatives considered**: +- `clap::ArgAction::SetTrue` / `SetFalse` — doesn't support default `true` cleanly +- Separate `--disable-channel-cache` flag — inconsistent with positive-bool convention + +## R2: DashMap behavior when unused + +**Decision**: Instantiate `DashMap::new()` even when caching is disabled; never insert into it. + +**Rationale**: An empty `DashMap` allocates minimal memory (just the internal shard array headers). Using `Option` would add `.as_ref().unwrap()` noise throughout `get_or_create_channel` for no meaningful benefit. The empty map is effectively free. + +**Alternatives considered**: +- `Option<Arc<DashMap>>` — adds unwrap complexity, `None` path, conditional metric reporting +- Trait-based strategy pattern — over-engineered for a boolean toggle + +## R3: Channel lifecycle when caching is disabled + +**Decision**: Create a fresh `Channel` for each `check_backend_internal` call. The channel is used for the single RPC and then dropped (Rust ownership ensures cleanup). + +**Rationale**: tonic `Channel` implements `Drop` which closes the underlying HTTP/2 connection. When the channel goes out of scope after the health check completes, the connection is cleaned up. No explicit teardown needed. + +**Alternatives considered**: +- Channel pool with size=0 — unnecessary abstraction +- Explicit `channel.close()` — not available in tonic's API; Drop handles it + +## R4: Test approach for verifying cache bypass + +**Decision**: Use two consecutive health checks to the same backend with caching disabled. Verify both succeed (functional correctness). Verify `GRPC_CHANNELS_ACTIVE` metric stays at 0 (behavioral verification that caching is bypassed). + +**Rationale**: Directly inspecting the DashMap internals would require exposing implementation details. 
The metric-based approach tests the observable behavior without coupling tests to internal data structures. + +**Alternatives considered**: +- Expose `channel_cache.len()` as public API — leaks implementation details +- Mock the DashMap — impractical with concrete type, and constitution favors integration tests diff --git a/specs/003-channel-cache-config/spec.md b/specs/003-channel-cache-config/spec.md new file mode 100644 index 0000000..e94f0fc --- /dev/null +++ b/specs/003-channel-cache-config/spec.md @@ -0,0 +1,78 @@ +# Feature Specification: Configurable Channel Cache + +**Feature Branch**: `003-channel-cache-config` +**Created**: 2026-02-23 +**Status**: Draft +**Input**: User description: "Current implementation of GrpcHealthChecker has internal cache of channels. Please make is configurable so the user may disable caching with config option" + +## User Scenarios & Testing *(mandatory)* + +### User Story 1 - Disable Channel Caching via Configuration (Priority: P1) + +As an operator deploying the HAProxy gRPC agent, I want to disable the internal gRPC channel cache so that every health check creates a fresh connection to the backend. This is useful in environments where cached connections may become stale silently (e.g., behind load balancers that silently drop idle connections) or during debugging when I want to verify that each check independently establishes connectivity. + +**Why this priority**: This is the core request — the ability to toggle channel caching on or off. Without this, the feature has no value. + +**Independent Test**: Can be fully tested by setting the cache-disable option, running multiple health checks against the same backend, and verifying that a new connection is established for each check (no channel reuse). + +**Acceptance Scenarios**: + +1. 
**Given** the configuration has channel caching disabled, **When** the agent performs two consecutive health checks to the same backend, **Then** each check creates a new gRPC channel (no channel is retrieved from cache). +2. **Given** the configuration has channel caching disabled, **When** the agent starts up, **Then** no channel cache data structure is populated during operation. +3. **Given** the configuration has channel caching enabled (default), **When** the agent performs two consecutive health checks to the same backend, **Then** the second check reuses the channel created by the first check (existing behavior preserved). + +--- + +### User Story 2 - Configure Channel Caching via All Supported Methods (Priority: P2) + +As an operator, I want to control channel caching through any of the existing configuration methods (environment variable, config file, or CLI argument) so that I can use whichever method fits my deployment workflow. + +**Why this priority**: Consistency with existing configuration patterns ensures operators don't need to learn new mechanisms. Less critical than the core toggle itself. + +**Independent Test**: Can be tested by setting the cache option via each configuration method independently and verifying the agent respects the setting in each case. + +**Acceptance Scenarios**: + +1. **Given** the environment variable for channel caching is set to disabled, **When** the agent starts, **Then** channel caching is disabled. +2. **Given** the TOML config file has channel caching set to disabled, **When** the agent starts, **Then** channel caching is disabled. +3. **Given** a CLI argument disables channel caching, **When** the agent starts, **Then** channel caching is disabled regardless of other configuration sources. +4. **Given** no configuration is provided for channel caching, **When** the agent starts, **Then** channel caching is enabled (backward-compatible default). 
+ +--- + +### Edge Cases + +- What happens when channel caching is disabled and the backend is under high load? Each request creates a new connection, which may increase connection overhead. This is expected and acceptable — the operator has explicitly chosen this trade-off. +- What happens to the active channels metric when caching is disabled? The metric should consistently report 0 cached channels. +- What happens if the configuration is changed from cached to uncached (or vice versa) between restarts? The agent should respect the current configuration at startup; there is no hot-reload requirement. + +## Requirements *(mandatory)* + +### Functional Requirements + +- **FR-001**: System MUST provide a configuration option to enable or disable gRPC channel caching. +- **FR-002**: Channel caching MUST be enabled by default to preserve backward compatibility with existing deployments. +- **FR-003**: The channel caching option MUST be configurable via environment variable, TOML config file, and CLI argument, following the existing configuration precedence (env < config file < CLI). +- **FR-004**: When channel caching is disabled, the system MUST create a new gRPC channel for every health check request. +- **FR-005**: When channel caching is disabled, the cached channels metric MUST report 0. +- **FR-006**: When channel caching is enabled, the system MUST behave identically to the current implementation (channels are cached and reused, with readiness validation). + +### Key Entities + +- **Channel Cache Setting**: A boolean configuration value indicating whether gRPC channels should be cached and reused across health checks or created fresh for each request. + +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: Operators can disable channel caching with a single configuration change and observe that every health check creates a fresh connection. 
+- **SC-002**: Enabling channel caching (default) produces identical behavior to the current system with no regression in check latency or connection reuse. +- **SC-003**: The configuration option is available through all three existing configuration methods (environment variable, config file, CLI argument). +- **SC-004**: The active channels metric accurately reflects 0 cached channels when caching is disabled. + +## Assumptions + +- The configuration is read at startup and does not change during runtime (no hot-reload). +- Disabling caching may increase connection overhead and latency per check; this is an accepted trade-off chosen by the operator. +- The existing configuration precedence (environment variable < config file < CLI argument) applies to this new option as well. +- No additional cache eviction strategies (e.g., TTL, max size) are in scope for this feature; this is strictly an on/off toggle. diff --git a/specs/003-channel-cache-config/tasks.md b/specs/003-channel-cache-config/tasks.md new file mode 100644 index 0000000..7c0ccd1 --- /dev/null +++ b/specs/003-channel-cache-config/tasks.md @@ -0,0 +1,173 @@ +# Tasks: Configurable Channel Cache + +**Input**: Design documents from `/specs/003-channel-cache-config/` +**Prerequisites**: plan.md (required), spec.md (required for user stories), research.md, data-model.md, quickstart.md + +**Tests**: Integration tests are included per constitution principle II (Integration-Heavy Testing). + +**Organization**: Tasks are grouped by user story to enable independent implementation and testing of each story. + +## Format: `[ID] [P?] [Story] Description` + +- **[P]**: Can run in parallel (different files, no dependencies) +- **[Story]**: Which user story this task belongs to (e.g., US1, US2) +- Include exact file paths in descriptions + +## Phase 1: Setup (Shared Infrastructure) + +**Purpose**: No new project structure needed. This feature modifies existing files only. 
Phase 1 covers the test infrastructure needed by both user stories. + +- [x] T001 Add `start_agent_with_config` helper function to `tests/common/mod.rs` that accepts an `AgentConfig` parameter, binds to port 0, and returns `(JoinHandle, SocketAddr)` — mirrors existing `start_agent()` but with custom config + +**Checkpoint**: Test infrastructure ready for user story implementation. + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: Add the `grpc_channel_cache_enabled` field to `AgentConfig` with serde/default support. This is required by both US1 (checker behavior) and US2 (multi-source config). + +**⚠️ CRITICAL**: No user story work can begin until this phase is complete. + +- [x] T002 Add `default_grpc_channel_cache_enabled` function returning `true` in `src/config.rs` +- [x] T003 Add `grpc_channel_cache_enabled: bool` field to `AgentConfig` struct with `#[serde(default = "default_grpc_channel_cache_enabled")]` in `src/config.rs` +- [x] T004 Update `AgentConfig::default()` impl to include `grpc_channel_cache_enabled: default_grpc_channel_cache_enabled()` in `src/config.rs` +- [x] T005 Add `grpc_channel_cache_enabled` field to all existing `AgentConfig` literals in unit tests in `src/config.rs` (tests: `test_config_validation_valid`, `test_config_validation_valid_custom_ports`, `test_config_validation_port_conflict`, `test_config_validation_invalid_server_port_zero`, `test_config_validation_invalid_metrics_port_zero`, `test_config_validation_invalid_connect_timeout_zero`, `test_config_validation_invalid_rpc_timeout_zero`) +- [x] T006 Log `grpc_channel_cache_enabled` config value at startup in `src/main.rs` tracing::info block + +**Checkpoint**: Config field exists with default `true`, compiles, existing tests pass (`cargo test`). + +--- + +## Phase 3: User Story 1 — Disable Channel Caching via Configuration (Priority: P1) 🎯 MVP + +**Goal**: When `grpc_channel_cache_enabled` is `false`, every health check creates a fresh gRPC channel. 
When `true` (default), existing caching behavior is preserved unchanged. + +**Independent Test**: Set `grpc_channel_cache_enabled = false` in config, run multiple health checks to same backend, verify all succeed and no channels are cached. + +### Implementation for User Story 1 + +- [x] T007 [US1] Modify `GrpcHealthChecker::get_or_create_channel` in `src/checker.rs`: when `self.config.grpc_channel_cache_enabled` is `false`, skip cache lookup (lines 53–70) and skip cache insert (line 114), and skip metric update on insert (line 117) — go directly to channel creation and return the fresh channel +- [x] T008 [US1] Ensure `GRPC_CHANNELS_ACTIVE` metric is never incremented when caching is disabled — verify no `metrics::GRPC_CHANNELS_ACTIVE.set(...)` calls occur on the no-cache path in `src/checker.rs` + +### Integration Tests for User Story 1 + +- [x] T009 [US1] Create `tests/cache_config_test.rs` with test `test_health_check_cache_disabled`: start mock backend (SERVING), start agent with `grpc_channel_cache_enabled: false`, send two consecutive checks to same backend, assert both return "up" +- [x] T010 [US1] Add test `test_cache_enabled_default_behavior` in `tests/cache_config_test.rs`: start mock backend (SERVING), start agent with default config (`grpc_channel_cache_enabled: true`), send two consecutive checks, assert both return "up" (regression test for existing behavior) +- [x] T011 [US1] Add test `test_cache_disabled_unreachable_backend` in `tests/cache_config_test.rs`: start agent with `grpc_channel_cache_enabled: false`, send check to nonexistent backend, assert returns "down" + +**Checkpoint**: Cache toggle works. Caching disabled = fresh channel per check. Caching enabled = existing behavior. All integration tests pass. 
+ +--- + +## Phase 4: User Story 2 — Configure Channel Caching via All Supported Methods (Priority: P2) + +**Goal**: The `grpc_channel_cache_enabled` option is configurable via environment variable (`HAPROXY_AGENT_GRPC_CHANNEL_CACHE`), TOML config file, and CLI argument (`--grpc-channel-cache` / `--no-grpc-channel-cache`), following existing precedence. + +**Independent Test**: Set the option via each config method independently, verify agent respects the setting. + +### Implementation for User Story 2 + +- [x] T012 [US2] Add `HAPROXY_AGENT_GRPC_CHANNEL_CACHE` environment variable parsing in `AgentConfig::load_from_env` in `src/config.rs` — parse "true"/"false" strings, bail on invalid values +- [x] T013 [US2] Add `--grpc-channel-cache` / `--no-grpc-channel-cache` CLI argument to `CliArgs` struct in `src/config.rs` using `#[arg(long, default_value_t = true, action = clap::ArgAction::Set)]` +- [x] T014 [US2] Add `grpc_channel_cache` override in `AgentConfig::apply_cli_overrides` in `src/config.rs` — apply `cli.grpc_channel_cache` to `config.grpc_channel_cache_enabled` + +### Unit Tests for User Story 2 + +- [x] T015 [US2] Add unit test `test_config_default_grpc_channel_cache_enabled` in `src/config.rs` — verify `AgentConfig::default().grpc_channel_cache_enabled == true` + +**Checkpoint**: Config option works via all three methods with correct precedence. Existing tests pass. + +--- + +## Phase 5: Polish & Cross-Cutting Concerns + +**Purpose**: Final validation across all stories. 
+ +- [x] T016 Run `cargo clippy` and fix any warnings introduced by this feature +- [x] T017 Run full test suite (`cargo test`) and verify all existing + new tests pass +- [x] T018 Validate quickstart.md scenarios manually: verify env var, TOML, and CLI examples from `specs/003-channel-cache-config/quickstart.md` are accurate + +--- + +## Dependencies & Execution Order + +### Phase Dependencies + +- **Setup (Phase 1)**: No dependencies — can start immediately +- **Foundational (Phase 2)**: T002-T004 are sequential (field depends on default fn). T005-T006 can start after T004. +- **User Story 1 (Phase 3)**: Depends on Phase 2 completion. T007-T008 are sequential. T009-T011 depend on T007-T008 and T001. +- **User Story 2 (Phase 4)**: Depends on Phase 2 completion. Independent of US1. T012-T014 can run in parallel (different functions). +- **Polish (Phase 5)**: Depends on Phases 3 and 4 completion. + +### User Story Dependencies + +- **User Story 1 (P1)**: Depends on Phase 2 only. No dependency on US2. +- **User Story 2 (P2)**: Depends on Phase 2 only. No dependency on US1. +- US1 and US2 can be implemented in parallel after Phase 2. 
+ +### Within Each User Story + +- Implementation tasks before test tasks (tests need the code to test against) +- Core logic (T007) before metric handling (T008) +- Config parsing tasks (T012-T014) can run in parallel (different functions in same file) + +### Parallel Opportunities + +- T001 (test helper) can run in parallel with T002-T006 (config changes) — different files +- T012, T013, T014 can run in parallel — different functions in config.rs +- US1 and US2 phases can run in parallel after Phase 2 + +--- + +## Parallel Example: User Story 1 + +```text +# After Phase 2 complete, launch implementation: +Task T007: "Modify get_or_create_channel cache bypass in src/checker.rs" +Task T008: "Verify metric handling on no-cache path in src/checker.rs" + +# After T007+T008 complete, launch tests: +Task T009: "Test cache disabled with healthy backend" +Task T010: "Test cache enabled default behavior" +Task T011: "Test cache disabled with unreachable backend" +``` + +## Parallel Example: User Story 2 + +```text +# After Phase 2 complete, launch all config source tasks: +Task T012: "Add HAPROXY_AGENT_GRPC_CHANNEL_CACHE env var parsing in src/config.rs" +Task T013: "Add --grpc-channel-cache CLI arg in src/config.rs" +Task T014: "Add CLI override in apply_cli_overrides in src/config.rs" +``` + +--- + +## Implementation Strategy + +### MVP First (User Story 1 Only) + +1. Complete Phase 1: Setup (T001) +2. Complete Phase 2: Foundational (T002-T006) +3. Complete Phase 3: User Story 1 (T007-T011) +4. **STOP and VALIDATE**: `cargo test --test cache_config_test` — all pass +5. Feature is usable with TOML config file only (serde handles it) + +### Incremental Delivery + +1. Setup + Foundational → Config field exists with default +2. Add User Story 1 → Cache toggle works → Test independently +3. Add User Story 2 → All config methods work → Test independently +4. 
Polish → Full validation + +--- + +## Notes + +- [P] tasks = different files, no dependencies +- [Story] label maps task to specific user story for traceability +- Constitution requires integration tests (Principle II) — test tasks included +- No new dependencies needed — all changes use existing crates +- Commit after each phase completion +- Stop at any checkpoint to validate independently diff --git a/src/checker.rs b/src/checker.rs index 5f1f451..6436725 100644 --- a/src/checker.rs +++ b/src/checker.rs @@ -49,24 +49,27 @@ impl GrpcHealthChecker { key: &BackendChannelKey, proxy_host: &str, ) -> Result { - // Check if channel exists in cache - if let Some(channel) = self.channel_cache.get(key) { - let channel_clone = channel.clone(); - drop(channel); // Release the DashMap lock - - // Try to get the channel ready with a very short timeout - let ready_check = tokio::time::timeout(Duration::from_millis(10), async { - let mut grpc = tonic::client::Grpc::new(channel_clone.clone()); - grpc.ready().await - }); - - if ready_check.await.is_ok() { - return Ok(channel_clone); + // Only use cache when channel caching is enabled + if self.config.grpc_channel_cache_enabled { + // Check if channel exists in cache + if let Some(channel) = self.channel_cache.get(key) { + let channel_clone = channel.clone(); + drop(channel); // Release the DashMap lock + + // Try to get the channel ready with a very short timeout + let ready_check = tokio::time::timeout(Duration::from_millis(10), async { + let mut grpc = tonic::client::Grpc::new(channel_clone.clone()); + grpc.ready().await + }); + + if ready_check.await.is_ok() { + return Ok(channel_clone); + } + + // Channel not ready, remove from cache + self.channel_cache.remove(key); + metrics::GRPC_CHANNELS_ACTIVE.set(self.channel_cache.len() as f64); } - - // Channel not ready, remove from cache - self.channel_cache.remove(key); - metrics::GRPC_CHANNELS_ACTIVE.set(self.channel_cache.len() as f64); } // T061: Create new channel with connect 
timeout @@ -110,11 +113,12 @@ impl GrpcHealthChecker { .await .map_err(|e| anyhow::anyhow!("Connection failed to {}: {}", endpoint, e))?; - // Cache the channel - self.channel_cache.insert(key.clone(), channel.clone()); - - // T127: Update GRPC_CHANNELS_ACTIVE gauge - metrics::GRPC_CHANNELS_ACTIVE.set(self.channel_cache.len() as f64); + // Only cache channel and update metric when caching is enabled + if self.config.grpc_channel_cache_enabled { + self.channel_cache.insert(key.clone(), channel.clone()); + // T127: Update GRPC_CHANNELS_ACTIVE gauge + metrics::GRPC_CHANNELS_ACTIVE.set(self.channel_cache.len() as f64); + } Ok(channel) } diff --git a/src/config.rs b/src/config.rs index 183fd83..218a341 100644 --- a/src/config.rs +++ b/src/config.rs @@ -44,6 +44,9 @@ pub struct AgentConfig { #[serde(default = "default_grpc_rpc_timeout")] pub grpc_rpc_timeout_ms: u64, + #[serde(default = "default_grpc_channel_cache_enabled")] + pub grpc_channel_cache_enabled: bool, + #[serde(default = "default_metrics_port")] pub metrics_port: u16, @@ -78,6 +81,10 @@ fn default_grpc_rpc_timeout() -> u64 { 1500 } +fn default_grpc_channel_cache_enabled() -> bool { + true +} + impl Default for AgentConfig { fn default() -> Self { AgentConfig { @@ -85,6 +92,7 @@ impl Default for AgentConfig { server_bind_address: default_bind_address(), grpc_connect_timeout_ms: default_grpc_connect_timeout(), grpc_rpc_timeout_ms: default_grpc_rpc_timeout(), + grpc_channel_cache_enabled: default_grpc_channel_cache_enabled(), metrics_port: default_metrics_port(), metrics_bind_address: default_bind_address(), log_level: LogLevel::default(), @@ -126,6 +134,10 @@ pub struct CliArgs { #[arg(long)] pub grpc_rpc_timeout: Option, + /// Enable or disable gRPC channel caching (default: true) + #[arg(long, num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set)] + pub grpc_channel_cache: Option, + /// Log level #[arg(long, value_enum)] pub log_level: Option, @@ -257,6 +269,14 @@ impl AgentConfig 
{ }; } + if let Ok(cache) = std::env::var("HAPROXY_AGENT_GRPC_CHANNEL_CACHE") { + config.grpc_channel_cache_enabled = match cache.to_lowercase().as_str() { + "true" => true, + "false" => false, + _ => anyhow::bail!("Invalid HAPROXY_AGENT_GRPC_CHANNEL_CACHE value: {} (expected 'true' or 'false')", cache), + }; + } + Ok(config) } @@ -305,6 +325,10 @@ impl AgentConfig { config.log_format = format; } + if let Some(cache) = cli.grpc_channel_cache { + config.grpc_channel_cache_enabled = cache; + } + config } } @@ -323,6 +347,7 @@ mod tests { grpc_rpc_timeout_ms: 1500, metrics_port: 9090, metrics_bind_address: "0.0.0.0".to_string(), + grpc_channel_cache_enabled: true, log_level: LogLevel::Info, log_format: LogFormat::Json, }; @@ -338,6 +363,7 @@ mod tests { server_bind_address: "127.0.0.1".to_string(), grpc_connect_timeout_ms: 500, grpc_rpc_timeout_ms: 1000, + grpc_channel_cache_enabled: true, metrics_port: 9091, metrics_bind_address: "127.0.0.1".to_string(), log_level: LogLevel::Debug, @@ -356,6 +382,7 @@ mod tests { server_bind_address: "0.0.0.0".to_string(), grpc_connect_timeout_ms: 1000, grpc_rpc_timeout_ms: 1500, + grpc_channel_cache_enabled: true, metrics_port: 9090, // Same as server_port! metrics_bind_address: "0.0.0.0".to_string(), log_level: LogLevel::Info, @@ -375,6 +402,7 @@ mod tests { server_bind_address: "0.0.0.0".to_string(), grpc_connect_timeout_ms: 1000, grpc_rpc_timeout_ms: 1500, + grpc_channel_cache_enabled: true, metrics_port: 9090, metrics_bind_address: "0.0.0.0".to_string(), log_level: LogLevel::Info, @@ -394,6 +422,7 @@ mod tests { server_bind_address: "0.0.0.0".to_string(), grpc_connect_timeout_ms: 1000, grpc_rpc_timeout_ms: 1500, + grpc_channel_cache_enabled: true, metrics_port: 0, metrics_bind_address: "0.0.0.0".to_string(), log_level: LogLevel::Info, @@ -414,6 +443,7 @@ mod tests { server_bind_address: "0.0.0.0".to_string(), grpc_connect_timeout_ms: 0, // Invalid! 
grpc_rpc_timeout_ms: 1500, + grpc_channel_cache_enabled: true, metrics_port: 9090, metrics_bind_address: "0.0.0.0".to_string(), log_level: LogLevel::Info, @@ -433,6 +463,7 @@ mod tests { server_bind_address: "0.0.0.0".to_string(), grpc_connect_timeout_ms: 1000, grpc_rpc_timeout_ms: 0, // Invalid! + grpc_channel_cache_enabled: true, metrics_port: 9090, metrics_bind_address: "0.0.0.0".to_string(), log_level: LogLevel::Info, @@ -456,4 +487,13 @@ mod tests { assert_eq!(config.metrics_port, 9090); assert_eq!(config.metrics_bind_address, "0.0.0.0"); } + + #[test] + fn test_config_default_grpc_channel_cache_enabled() { + let config = AgentConfig::default(); + assert!( + config.grpc_channel_cache_enabled, + "grpc_channel_cache_enabled should default to true" + ); + } } diff --git a/src/main.rs b/src/main.rs index b870afa..7647b57 100644 --- a/src/main.rs +++ b/src/main.rs @@ -56,6 +56,7 @@ async fn main() -> Result<()> { metrics_port = config.metrics_port, grpc_connect_timeout_ms = config.grpc_connect_timeout_ms, grpc_rpc_timeout_ms = config.grpc_rpc_timeout_ms, + grpc_channel_cache_enabled = config.grpc_channel_cache_enabled, log_level = ?config.log_level, log_format = ?config.log_format, "HAProxy gRPC Agent starting" diff --git a/tests/cache_config_test.rs b/tests/cache_config_test.rs new file mode 100644 index 0000000..bdf464a --- /dev/null +++ b/tests/cache_config_test.rs @@ -0,0 +1,94 @@ +// Integration tests for channel cache configuration +// Tests cache-enabled and cache-disabled behavior + +mod common; + +use common::{cleanup_agent, send_check, start_agent_with_config, start_mock_backend}; +use haproxy_grpc_agent::config::AgentConfig; + +// T009: Test that health checks work with caching disabled +#[tokio::test] +async fn test_health_check_cache_disabled() { + let (_container, backend_port) = start_mock_backend("SERVING").await; + + let config = AgentConfig { + server_port: 0, + server_bind_address: "127.0.0.1".to_string(), + metrics_port: 0, + 
metrics_bind_address: "127.0.0.1".to_string(), + grpc_channel_cache_enabled: false, + ..AgentConfig::default() + }; + + let (handle, agent_addr) = start_agent_with_config(config).await; + + // Send two consecutive checks — both should succeed with fresh channels + let response1 = send_check(agent_addr, "127.0.0.1", backend_port).await; + assert_eq!( + response1, "up", + "First check with cache disabled should return 'up'" + ); + + let response2 = send_check(agent_addr, "127.0.0.1", backend_port).await; + assert_eq!( + response2, "up", + "Second check with cache disabled should return 'up'" + ); + + cleanup_agent(handle); +} + +// T010: Test that default config (cache enabled) preserves existing behavior +#[tokio::test] +async fn test_cache_enabled_default_behavior() { + let (_container, backend_port) = start_mock_backend("SERVING").await; + + let config = AgentConfig { + server_port: 0, + server_bind_address: "127.0.0.1".to_string(), + metrics_port: 0, + metrics_bind_address: "127.0.0.1".to_string(), + grpc_channel_cache_enabled: true, + ..AgentConfig::default() + }; + + let (handle, agent_addr) = start_agent_with_config(config).await; + + // Send two consecutive checks — both should succeed (second uses cached channel) + let response1 = send_check(agent_addr, "127.0.0.1", backend_port).await; + assert_eq!( + response1, "up", + "First check with cache enabled should return 'up'" + ); + + let response2 = send_check(agent_addr, "127.0.0.1", backend_port).await; + assert_eq!( + response2, "up", + "Second check with cache enabled should return 'up' (cached channel)" + ); + + cleanup_agent(handle); +} + +// T011: Test that unreachable backend returns 'down' with caching disabled +#[tokio::test] +async fn test_cache_disabled_unreachable_backend() { + let config = AgentConfig { + server_port: 0, + server_bind_address: "127.0.0.1".to_string(), + metrics_port: 0, + metrics_bind_address: "127.0.0.1".to_string(), + grpc_channel_cache_enabled: false, + ..AgentConfig::default() 
+ }; + + let (handle, agent_addr) = start_agent_with_config(config).await; + + let response = send_check(agent_addr, "nonexistent.example.com", 9999).await; + assert_eq!( + response, "down", + "Unreachable backend with cache disabled should return 'down'" + ); + + cleanup_agent(handle); +} diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 63dab22..c4f322f 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -118,6 +118,27 @@ pub async fn send_raw_request(agent_addr: SocketAddr, request: &str) -> String { result.expect("Agent request timed out after 5 seconds") } +/// Starts the agent server in-process with a custom config on a dynamic port. +/// Returns the tokio JoinHandle and the bound SocketAddr. +pub async fn start_agent_with_config( + config: AgentConfig, +) -> (tokio::task::JoinHandle>, SocketAddr) { + let listener = TcpListener::bind("127.0.0.1:0") + .await + .expect("Failed to bind agent listener"); + let addr = listener + .local_addr() + .expect("Failed to get agent bound address"); + + let server = AgentServer::new(config); + let handle = tokio::spawn(async move { server.run_with_listener(listener).await }); + + // Brief pause to let the accept loop start + tokio::time::sleep(Duration::from_millis(50)).await; + + (handle, addr) +} + /// Aborts the agent server task. 
pub fn cleanup_agent(handle: tokio::task::JoinHandle>) { handle.abort(); From 8b4b791c33879b2d2dcaf61c8bc6cbda14460485 Mon Sep 17 00:00:00 2001 From: Alik Kurdyukov Date: Mon, 23 Feb 2026 17:07:09 +0400 Subject: [PATCH 2/2] warning fixed --- src/config.rs | 5 ++++- tests/cache_config_test.rs | 13 ++----------- tests/common/mod.rs | 19 ++----------------- 3 files changed, 8 insertions(+), 29 deletions(-) diff --git a/src/config.rs b/src/config.rs index 218a341..1fe4853 100644 --- a/src/config.rs +++ b/src/config.rs @@ -273,7 +273,10 @@ impl AgentConfig { config.grpc_channel_cache_enabled = match cache.to_lowercase().as_str() { "true" => true, "false" => false, - _ => anyhow::bail!("Invalid HAPROXY_AGENT_GRPC_CHANNEL_CACHE value: {} (expected 'true' or 'false')", cache), + _ => anyhow::bail!( + "Invalid HAPROXY_AGENT_GRPC_CHANNEL_CACHE value: {} (expected 'true' or 'false')", + cache + ), }; } diff --git a/tests/cache_config_test.rs b/tests/cache_config_test.rs index bdf464a..478f398 100644 --- a/tests/cache_config_test.rs +++ b/tests/cache_config_test.rs @@ -3,7 +3,7 @@ mod common; -use common::{cleanup_agent, send_check, start_agent_with_config, start_mock_backend}; +use common::{cleanup_agent, send_check, start_agent, start_agent_with_config, start_mock_backend}; use haproxy_grpc_agent::config::AgentConfig; // T009: Test that health checks work with caching disabled @@ -43,16 +43,7 @@ async fn test_health_check_cache_disabled() { async fn test_cache_enabled_default_behavior() { let (_container, backend_port) = start_mock_backend("SERVING").await; - let config = AgentConfig { - server_port: 0, - server_bind_address: "127.0.0.1".to_string(), - metrics_port: 0, - metrics_bind_address: "127.0.0.1".to_string(), - grpc_channel_cache_enabled: true, - ..AgentConfig::default() - }; - - let (handle, agent_addr) = start_agent_with_config(config).await; + let (handle, agent_addr) = start_agent().await; // Send two consecutive checks — both should succeed (second uses cached 
channel) let response1 = send_check(agent_addr, "127.0.0.1", backend_port).await; diff --git a/tests/common/mod.rs b/tests/common/mod.rs index c4f322f..091d75e 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1,5 +1,4 @@ // Shared test utilities for integration and resilience tests - use haproxy_grpc_agent::config::AgentConfig; use haproxy_grpc_agent::server::AgentServer; use std::net::SocketAddr; @@ -55,7 +54,7 @@ pub async fn start_mock_backend(health_status: &str) -> (ContainerAsync (tokio::task::JoinHandle>, SocketAddr) { let config = AgentConfig { @@ -65,21 +64,7 @@ pub async fn start_agent() -> (tokio::task::JoinHandle>, Sock metrics_bind_address: "127.0.0.1".to_string(), ..AgentConfig::default() }; - - let listener = TcpListener::bind("127.0.0.1:0") - .await - .expect("Failed to bind agent listener"); - let addr = listener - .local_addr() - .expect("Failed to get agent bound address"); - - let server = AgentServer::new(config); - let handle = tokio::spawn(async move { server.run_with_listener(listener).await }); - - // Brief pause to let the accept loop start - tokio::time::sleep(Duration::from_millis(50)).await; - - (handle, addr) + start_agent_with_config(config).await } /// Sends a health check request to the agent and returns the trimmed response.