diff --git a/Cargo.lock b/Cargo.lock index 9f0de00c..12060730 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -427,49 +427,6 @@ dependencies = [ "fs_extra", ] -[[package]] -name = "axum" -version = "0.8.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8" -dependencies = [ - "axum-core", - "bytes", - "futures-util", - "http", - "http-body", - "http-body-util", - "itoa", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "serde_core", - "sync_wrapper", - "tower", - "tower-layer", - "tower-service", -] - -[[package]] -name = "axum-core" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" -dependencies = [ - "bytes", - "futures-core", - "http", - "http-body", - "http-body-util", - "mime", - "pin-project-lite", - "sync_wrapper", - "tower-layer", - "tower-service", -] - [[package]] name = "base64" version = "0.22.1" @@ -1328,7 +1285,6 @@ dependencies = [ "tokio", "tokio-rustls", "toml", - "tonic", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -1362,10 +1318,8 @@ name = "fila-proto" version = "0.1.0" dependencies = [ "prost", + "prost-build", "prost-types", - "tonic", - "tonic-prost", - "tonic-prost-build", ] [[package]] @@ -1374,6 +1328,7 @@ version = "0.1.0" dependencies = [ "bytes", "fila-fibp", + "fila-sdk", "rustls", "rustls-pemfile", "tempfile", @@ -1392,23 +1347,13 @@ dependencies = [ "bytes", "fila-core", "fila-fibp", - "fila-proto", - "futures-core", - "http", - "opentelemetry", - "pin-project-lite", "rustls", "rustls-pemfile", "tempfile", "tokio", "tokio-rustls", - "tokio-stream", "toml", - "tonic", - "tonic-prost", - "tower", "tracing", - "tracing-opentelemetry", "uuid", ] @@ -1775,12 +1720,6 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" -[[package]] -name = "httpdate" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" - [[package]] name = "hyper" version = "1.8.1" @@ -1795,7 +1734,6 @@ dependencies = [ "http", "http-body", "httparse", - "httpdate", "itoa", "pin-project-lite", "pin-utils", @@ -2276,24 +2214,12 @@ dependencies = [ "regex-automata", ] -[[package]] -name = "matchit" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" - [[package]] name = "memchr" version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" -[[package]] -name = "mime" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" - [[package]] name = "minimal-lexical" version = "0.2.1" @@ -2980,8 +2906,6 @@ dependencies = [ "prettyplease", "prost", "prost-types", - "pulldown-cmark", - "pulldown-cmark-to-cmark", "regex", "syn 2.0.114", "tempfile", @@ -3029,26 +2953,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "pulldown-cmark" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e8bbe1a966bd2f362681a44f6edce3c2310ac21e4d5067a6e7ec396297a6ea0" -dependencies = [ - "bitflags 2.10.0", - "memchr", - "unicase", -] - -[[package]] -name = "pulldown-cmark-to-cmark" -version = "22.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50793def1b900256624a709439404384204a5dc3a6ec580281bfaac35e882e90" -dependencies = [ - "pulldown-cmark", -] - [[package]] name = "quick-error" version = "1.2.3" @@ -4259,10 +4163,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a286e33f82f8a1ee2df63f4fa35c0becf4a85a0cb03091a15fd7bf0b402dc94a" dependencies = [ "async-trait", - "axum", "base64", "bytes", - "h2", "http", "http-body", "http-body-util", @@ -4271,10 +4173,8 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "socket2 0.6.2", "sync_wrapper", "tokio", - "tokio-rustls", "tokio-stream", "tower", "tower-layer", @@ -4282,18 +4182,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "tonic-build" -version = "0.14.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27aac809edf60b741e2d7db6367214d078856b8a5bff0087e94ff330fb97b6fc" -dependencies = [ - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.114", -] - [[package]] name = "tonic-prost" version = "0.14.3" @@ -4305,22 +4193,6 @@ dependencies = [ "tonic", ] -[[package]] -name = "tonic-prost-build" -version = "0.14.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4556786613791cfef4ed134aa670b61a85cfcacf71543ef33e8d801abae988f" -dependencies = [ - "prettyplease", - "proc-macro2", - "prost-build", - "prost-types", - "quote", - "syn 2.0.114", - "tempfile", - "tonic-build", -] - [[package]] name = "tower" version = "0.5.3" @@ -4498,12 +4370,6 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" -[[package]] -name = "unicase" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" - [[package]] name = "unicode-ident" version = "1.0.23" diff --git a/Cargo.toml b/Cargo.toml index 3f5aa6e6..12e4765e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,12 +19,10 @@ homepage = "https://github.com/faiscadev/fila" rust-version = "1.75" [workspace.dependencies] -# gRPC + Protobuf (versions must match) -tonic = { version = "0.14", features = ["tls-ring"] } -tonic-prost = "0.14" +# Protobuf (used for Raft log entry serialization) prost = "0.14" prost-types = "0.14" -tonic-prost-build = "0.14" +prost-build = "0.14" tokio = { version = "1", features = ["full"] } serde = { version = "1", features = ["derive"] } @@ -49,9 +47,6 @@ rcgen = "0.12" sha2 = "0.10" tokio-stream = "0.1" -tower = "0.5" -http = "1" -pin-project-lite = "0.2" fila-proto = { path = "crates/fila-proto", version = "0.1.0" } fila-core = { path = "crates/fila-core", version = "0.1.0" } diff --git a/_bmad-output/implementation-artifacts/epic-execution-state.yaml b/_bmad-output/implementation-artifacts/epic-execution-state.yaml index dd46e4bc..655668d1 100644 --- a/_bmad-output/implementation-artifacts/epic-execution-state.yaml +++ b/_bmad-output/implementation-artifacts/epic-execution-state.yaml @@ -32,9 +32,9 @@ stories: dependsOn: ["20.1", "20.2", "20.3"] - id: "20.5" title: "gRPC Removal & Final Benchmarks" - status: pending + status: completed currentPhase: "" - branch: "" + branch: "feat/20.5-grpc-removal" pr: null dependsOn: ["20.1", "20.2", "20.3", "20.4"] skippedIssues: [] diff --git a/_bmad-output/implementation-artifacts/sprint-status.yaml b/_bmad-output/implementation-artifacts/sprint-status.yaml index 5c063ebc..262dcd31 100644 --- a/_bmad-output/implementation-artifacts/sprint-status.yaml +++ b/_bmad-output/implementation-artifacts/sprint-status.yaml @@ -206,7 +206,7 @@ development_status: 20-2-admin-operations-auth-on-binary-protocol: done 20-3-rust-sdk-binary-protocol-client: done 20-4-cli-cluster-inter-node-migration: done - 20-5-grpc-removal-final-benchmarks: backlog + 20-5-grpc-removal-final-benchmarks: done epic-20-retrospective: optional # Epic 21: External SDK Migration diff --git a/crates/fila-bench/src/server.rs b/crates/fila-bench/src/server.rs index 7e19f7fb..b4fbd5ce 100644 --- a/crates/fila-bench/src/server.rs +++ b/crates/fila-bench/src/server.rs @@ -12,8 +12,6 @@ pub struct BenchServer { child: Option, /// Binary protocol address (host:port) for SDK connections. addr: String, - /// gRPC address (http://host:port) for CLI commands. - grpc_addr: String, _data_dir: tempfile::TempDir, } @@ -25,13 +23,8 @@ impl BenchServer { /// Start a new fila-server instance with a specific DRR quantum. pub fn start_with_quantum(quantum: Option) -> Self { - let grpc_port = free_port(); - let mut binary_port = free_port(); - while binary_port == grpc_port { - binary_port = free_port(); - } - let grpc_addr = format!("127.0.0.1:{grpc_port}"); - let binary_addr = format!("127.0.0.1:{binary_port}"); + let port = free_port(); + let addr = format!("127.0.0.1:{port}"); let data_dir = tempfile::tempdir().expect("create temp dir"); let scheduler_section = match quantum { @@ -40,8 +33,7 @@ impl BenchServer { }; let config_content = format!( r#"[server] -listen_addr = "{grpc_addr}" -binary_addr = "{binary_addr}" +listen_addr = "{addr}" {scheduler_section} [telemetry] otlp_endpoint = "" @@ -78,31 +70,22 @@ otlp_endpoint = "" } }); - // Poll TCP until both ports are reachable. + // Poll TCP until port is reachable. let start = std::time::Instant::now(); - let mut grpc_ok = false; - let mut binary_ok = false; while start.elapsed() < Duration::from_secs(10) { - if !grpc_ok && std::net::TcpStream::connect(&grpc_addr).is_ok() { - grpc_ok = true; - } - if !binary_ok && std::net::TcpStream::connect(&binary_addr).is_ok() { - binary_ok = true; - } - if grpc_ok && binary_ok { + if std::net::TcpStream::connect(&addr).is_ok() { break; } std::thread::sleep(Duration::from_millis(50)); } assert!( - grpc_ok && binary_ok, + std::net::TcpStream::connect(&addr).is_ok(), "fila-server did not become reachable within 10s" ); Self { child: Some(child), - addr: binary_addr, - grpc_addr: format!("http://{grpc_addr}"), + addr, _data_dir: data_dir, } } @@ -112,9 +95,9 @@ otlp_endpoint = "" &self.addr } - /// The gRPC address (http://host:port) for CLI commands. - pub fn grpc_addr(&self) -> &str { - &self.grpc_addr + /// The address for CLI commands (same as addr). + pub fn cli_addr(&self) -> &str { + &self.addr } /// The raw host:port address for the binary protocol. diff --git a/crates/fila-cli/src/main.rs b/crates/fila-cli/src/main.rs index 29205e48..da9120cc 100644 --- a/crates/fila-cli/src/main.rs +++ b/crates/fila-cli/src/main.rs @@ -350,11 +350,7 @@ async fn connect( root_store.extend(webpki_roots::TLS_SERVER_ROOTS.iter().cloned()); } - let mut config = rustls::ClientConfig::builder() - .with_root_certificates(root_store.clone()) - .with_no_client_auth(); - - if let (Some(cert_path), Some(key_path)) = (tls_cert, tls_key) { + let config = if let (Some(cert_path), Some(key_path)) = (tls_cert, tls_key) { let cert_pem = match std::fs::read(cert_path) { Ok(b) => b, Err(e) => { @@ -389,14 +385,18 @@ async fn connect( process::exit(1); }); - config = rustls::ClientConfig::builder() - .with_root_certificates(root_store.clone()) + rustls::ClientConfig::builder() + .with_root_certificates(root_store) .with_client_auth_cert(certs, key) .unwrap_or_else(|e| { eprintln!("Error: TLS client auth configuration failed: {e}"); process::exit(1); - }); - } + }) + } else { + rustls::ClientConfig::builder() + .with_root_certificates(root_store) + .with_no_client_auth() + }; // Extract hostname from addr for SNI let hostname = addr.split(':').next().unwrap_or("localhost"); diff --git a/crates/fila-core/Cargo.toml b/crates/fila-core/Cargo.toml index 47b30cc0..7b1548d3 100644 --- a/crates/fila-core/Cargo.toml +++ b/crates/fila-core/Cargo.toml @@ -29,7 +29,6 @@ tokio = { version = "1", features = ["sync", "rt", "rt-multi-thread", "macros", mlua.workspace = true openraft.workspace = true nonempty.workspace = true -tonic.workspace = true sha2.workspace = true prost.workspace = true prost-types.workspace = true diff --git a/crates/fila-core/src/broker/config.rs b/crates/fila-core/src/broker/config.rs index 416529a6..3317b125 100644 --- a/crates/fila-core/src/broker/config.rs +++ b/crates/fila-core/src/broker/config.rs @@ -32,7 +32,7 @@ pub struct ClusterConfig { /// When empty, this node bootstraps a new single-node cluster. /// When non-empty, the node joins an existing cluster by contacting these peers. pub peers: Vec, - /// Listen address for intra-cluster gRPC communication (separate from client port). + /// Listen address for intra-cluster communication (separate from client port). pub bind_addr: String, /// Raft election timeout in milliseconds. pub election_timeout_ms: u64, @@ -68,7 +68,7 @@ pub struct TlsParams { /// Authentication configuration. Presence in `BrokerConfig.auth` (i.e. an `[auth]` section /// in `fila.toml`) enables API key authentication; absence disables it entirely. /// -/// When enabled, every gRPC RPC must include `authorization: Bearer ` metadata. +/// When enabled, every request must include a valid API key during the connection handshake. /// `bootstrap_apikey` is required: it is a permanent operator key accepted without a storage /// lookup, enabling operators to provision the first real API key. It can also be set (or /// overridden) via the `FILA_BOOTSTRAP_APIKEY` environment variable. @@ -83,14 +83,12 @@ pub struct AuthConfig { pub bootstrap_apikey: String, } -/// Server configuration (listen addresses). +/// Server configuration (listen address). #[derive(Debug, Clone, Deserialize)] #[serde(default)] pub struct ServerConfig { - /// gRPC listen address (temporary, for cluster comms until binary protocol migration). + /// Binary protocol (FIBP) listen address. pub listen_addr: String, - /// Binary protocol (FIBP) listen address. Defaults to None (disabled). - pub binary_addr: Option, } /// Scheduler configuration (channel capacity, idle timeout, DRR quantum). @@ -163,7 +161,6 @@ impl Default for ServerConfig { fn default() -> Self { Self { listen_addr: "0.0.0.0:5555".to_string(), - binary_addr: None, } } } @@ -201,7 +198,6 @@ mod tests { fn default_config_values() { let config = BrokerConfig::default(); assert_eq!(config.server.listen_addr, "0.0.0.0:5555"); - assert_eq!(config.server.binary_addr, None); assert_eq!(config.scheduler.command_channel_capacity, 10_000); assert_eq!(config.scheduler.idle_timeout_ms, 100); assert_eq!(config.scheduler.quantum, 1000); diff --git a/crates/fila-core/src/broker/mod.rs b/crates/fila-core/src/broker/mod.rs index 6ebe2f18..37ba7bba 100644 --- a/crates/fila-core/src/broker/mod.rs +++ b/crates/fila-core/src/broker/mod.rs @@ -22,7 +22,7 @@ pub use config::{AuthConfig, BrokerConfig, TlsParams}; use scheduler::Scheduler; /// The broker owns the scheduler thread and the inbound command channel. -/// IO threads (gRPC handlers) send commands through `send_command()`, +/// IO threads (connection handlers) send commands through `send_command()`, /// and the single-threaded scheduler processes them sequentially. pub struct Broker { command_tx: crossbeam_channel::Sender, diff --git a/crates/fila-core/src/broker/scheduler/recovery.rs b/crates/fila-core/src/broker/scheduler/recovery.rs index d1ddbcf6..7bfff391 100644 --- a/crates/fila-core/src/broker/scheduler/recovery.rs +++ b/crates/fila-core/src/broker/scheduler/recovery.rs @@ -372,7 +372,7 @@ impl Scheduler { /// /// When entries are removed, the `ConsumerEntry.tx` channels are dropped, /// causing the converter tasks in service.rs to detect closure and end - /// the gRPC streams. Consumers will receive a disconnection and must + /// the streams. Consumers will receive a disconnection and must /// reconnect to the new leader. pub(super) fn drop_queue_consumers(&mut self, queue_id: &str) { let before = self.consumers.len(); diff --git a/crates/fila-core/src/cluster/binary_service.rs b/crates/fila-core/src/cluster/binary_service.rs index d40155c6..ca29b9a5 100644 --- a/crates/fila-core/src/cluster/binary_service.rs +++ b/crates/fila-core/src/cluster/binary_service.rs @@ -1,8 +1,8 @@ //! Binary protocol service for cluster inter-node communication. //! -//! Replaces the gRPC-based `ClusterGrpcService` with a TCP listener that speaks -//! the FIBP (Fila Binary Protocol). Raft payloads remain protobuf-serialized -//! but are transported as opaque bytes inside binary protocol frames. +//! Uses TCP with FIBP (Fila Binary Protocol) framing. Raft payloads remain +//! protobuf-serialized but are transported as opaque bytes inside binary +//! protocol frames. use std::sync::Arc; @@ -24,6 +24,107 @@ use super::proto_convert; use super::types::{NodeId, TypeConfig}; use crate::Broker; +/// Apply a forwarded write to the local scheduler after Raft commit. +/// This ensures the leader's scheduler has the data for serving consumers. +pub(super) async fn apply_to_scheduler(broker: &crate::Broker, req: &super::types::ClusterRequest) { + match req { + super::types::ClusterRequest::Enqueue { messages } => { + if messages.is_empty() { + return; + } + let msgs = messages.clone(); + let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); + if let Err(e) = broker.send_command(crate::SchedulerCommand::Enqueue { + messages: msgs, + reply: reply_tx, + }) { + tracing::error!(error = %e, "failed to apply forwarded enqueue to scheduler"); + return; + } + match reply_rx.await { + Err(e) => { + tracing::error!(error = %e, "scheduler dropped reply for forwarded enqueue"); + } + Ok(results) => { + for result in results { + if let Err(e) = result { + tracing::error!(error = %e, "scheduler rejected forwarded enqueue"); + } + } + } + } + } + super::types::ClusterRequest::Ack { items } => { + if items.is_empty() { + return; + } + let ack_items: Vec = items + .iter() + .map(|i| crate::broker::command::AckItem { + queue_id: i.queue_id.clone(), + msg_id: i.msg_id, + }) + .collect(); + let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); + if let Err(e) = broker.send_command(crate::SchedulerCommand::Ack { + items: ack_items, + reply: reply_tx, + }) { + tracing::error!(error = %e, "failed to apply forwarded ack to scheduler"); + return; + } + match reply_rx.await { + Err(e) => { + tracing::error!(error = %e, "scheduler dropped reply for forwarded ack"); + } + Ok(results) => { + for result in results { + if let Err(e) = result { + tracing::error!(error = %e, "scheduler rejected forwarded ack"); + } + } + } + } + } + super::types::ClusterRequest::Nack { items } => { + if items.is_empty() { + return; + } + let nack_items: Vec = items + .iter() + .map(|i| crate::broker::command::NackItem { + queue_id: i.queue_id.clone(), + msg_id: i.msg_id, + error: i.error.clone(), + }) + .collect(); + let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); + if let Err(e) = broker.send_command(crate::SchedulerCommand::Nack { + items: nack_items, + reply: reply_tx, + }) { + tracing::error!(error = %e, "failed to apply forwarded nack to scheduler"); + return; + } + match reply_rx.await { + Err(e) => { + tracing::error!(error = %e, "scheduler dropped reply for forwarded nack"); + } + Ok(results) => { + for result in results { + if let Err(e) = result { + tracing::error!(error = %e, "scheduler rejected forwarded nack"); + } + } + } + } + } + // Other request types (CreateQueue, DeleteQueue, etc.) are handled + // through the meta Raft event system, not here. + _ => {} + } +} + /// Shared state for the cluster binary protocol service. pub struct ClusterBinaryService { meta_raft: Arc>, @@ -417,7 +518,7 @@ async fn handle_client_write( // Apply to local scheduler so forwarded writes have // real side effects on the leader node. if let Some(broker) = service.broker.get() { - super::grpc_service::apply_to_scheduler(broker, &cluster_req).await; + apply_to_scheduler(broker, &cluster_req).await; } let response_proto = fila_proto::ClusterResponseProto::from(resp.data); diff --git a/crates/fila-core/src/cluster/grpc_service.rs b/crates/fila-core/src/cluster/grpc_service.rs deleted file mode 100644 index d76ebaff..00000000 --- a/crates/fila-core/src/cluster/grpc_service.rs +++ /dev/null @@ -1,402 +0,0 @@ -use std::sync::Arc; - -use openraft::error::RaftError; -use openraft::{BasicNode, Raft}; -use tonic::{Request, Response, Status}; - -use super::multi_raft::MultiRaftManager; -use super::proto_convert; -use super::types::{NodeId, TypeConfig}; -use crate::Broker; -use fila_proto::fila_cluster_server::FilaCluster; -use fila_proto::{ - AddNodeRequest, AddNodeResponse, GetNodeInfoRequest, GetNodeInfoResponse, - RaftAppendEntriesRequest, RaftAppendEntriesResponse, RaftClientWriteRequest, - RaftClientWriteResponse, RaftInstallSnapshotRequest, RaftInstallSnapshotResponse, - RaftVoteRequest, RaftVoteResponse, RemoveNodeRequest, RemoveNodeResponse, -}; - -/// gRPC service handler that forwards Raft RPCs to the correct local Raft -/// instance — either the meta group or a queue-level group based on `group_id`. -pub struct ClusterGrpcService { - meta_raft: Arc>, - multi_raft: Arc, - /// Broker reference for applying forwarded writes to the local scheduler. - /// Set after Broker creation via OnceLock — None during initial startup. - broker: Arc>>, - /// This node's ID and client-facing gRPC address (for GetNodeInfo RPC). - node_id: NodeId, - client_addr: String, -} - -impl ClusterGrpcService { - pub fn new( - meta_raft: Arc>, - multi_raft: Arc, - broker: Arc>>, - node_id: NodeId, - client_addr: String, - ) -> Self { - Self { - meta_raft, - multi_raft, - broker, - node_id, - client_addr, - } - } - - /// Resolve which Raft instance should handle this RPC. - async fn resolve_raft(&self, group_id: &str) -> Result>, Status> { - if group_id.is_empty() { - Ok(Arc::clone(&self.meta_raft)) - } else { - self.multi_raft - .get_raft(group_id) - .await - .ok_or_else(|| Status::not_found(format!("unknown raft group: {group_id}"))) - } - } -} - -#[tonic::async_trait] -impl FilaCluster for ClusterGrpcService { - async fn append_entries( - &self, - request: Request, - ) -> Result, Status> { - let inner = request.into_inner(); - let raft = self.resolve_raft(&inner.group_id).await?; - - let req = proto_convert::append_entries_request_from_proto(inner) - .map_err(|e| Status::invalid_argument(format!("deserialize: {e}")))?; - - match raft.append_entries(req).await { - Ok(resp) => Ok(Response::new( - proto_convert::append_entries_response_to_proto(resp), - )), - Err(e) => Err(Status::internal(format!("raft error: {e}"))), - } - } - - async fn vote( - &self, - request: Request, - ) -> Result, Status> { - let inner = request.into_inner(); - let raft = self.resolve_raft(&inner.group_id).await?; - - let req = proto_convert::vote_request_from_proto(inner) - .map_err(|e| Status::invalid_argument(format!("deserialize: {e}")))?; - - match raft.vote(req).await { - Ok(resp) => Ok(Response::new(proto_convert::vote_response_to_proto(resp))), - Err(e) => Err(Status::internal(format!("raft error: {e}"))), - } - } - - async fn install_snapshot( - &self, - request: Request, - ) -> Result, Status> { - let inner = request.into_inner(); - let raft = self.resolve_raft(&inner.group_id).await?; - - let req = proto_convert::install_snapshot_request_from_proto(inner) - .map_err(|e| Status::invalid_argument(format!("deserialize: {e}")))?; - - match raft.install_snapshot(req).await { - Ok(resp) => Ok(Response::new( - proto_convert::install_snapshot_response_to_proto(resp), - )), - Err(e) => Err(Status::internal(format!("raft error: {e}"))), - } - } - - async fn add_node( - &self, - request: Request, - ) -> Result, Status> { - let req = request.into_inner(); - let node_id = req.node_id; - let node = BasicNode { - addr: req.addr.clone(), - }; - - // Store the joining node's client address for leader hint routing. - if !req.client_addr.is_empty() { - self.multi_raft - .register_client_addr(node_id, &req.client_addr) - .await; - } - - // Add/remove node only affects the meta Raft group. - if let Err(e) = self.meta_raft.add_learner(node_id, node, true).await { - return Ok(Response::new(handle_membership_error(e))); - } - - let mut members = std::collections::BTreeSet::new(); - members.insert(node_id); - match self - .meta_raft - .change_membership(openraft::ChangeMembers::AddVoterIds(members), false) - .await - { - Ok(_) => Ok(Response::new(AddNodeResponse { - success: true, - error: String::new(), - leader_addr: String::new(), - })), - Err(e) => Ok(Response::new(handle_membership_error(e))), - } - } - - async fn get_node_info( - &self, - _request: Request, - ) -> Result, Status> { - Ok(Response::new(GetNodeInfoResponse { - node_id: self.node_id, - client_addr: self.client_addr.clone(), - })) - } - - async fn client_write( - &self, - request: Request, - ) -> Result, Status> { - let inner = request.into_inner(); - let group_id = inner.group_id.clone(); - let raft = self.resolve_raft(&group_id).await?; - - let req: super::types::ClusterRequest = inner - .request - .ok_or_else(|| Status::invalid_argument("missing request"))? - .try_into() - .map_err(|e: super::proto_convert::ConvertError| { - Status::invalid_argument(format!("deserialize: {e}")) - })?; - - match raft.client_write(req.clone()).await { - Ok(resp) => { - // Apply to local scheduler so forwarded writes have - // real side effects on the leader node. - if let Some(broker) = self.broker.get() { - apply_to_scheduler(broker, &req).await; - } - - let response_proto = fila_proto::ClusterResponseProto::from(resp.data); - Ok(Response::new(RaftClientWriteResponse { - response: Some(response_proto), - })) - } - Err(openraft::error::RaftError::APIError( - openraft::error::ClientWriteError::ForwardToLeader(fwd), - )) => { - let leader_addr = fwd - .leader_node - .as_ref() - .map(|n| n.addr.clone()) - .unwrap_or_default(); - // Signal forwarding via error response with leader address. - let response_proto = - fila_proto::ClusterResponseProto::from(super::types::ClusterResponse::Error { - message: format!("ForwardToLeader:{leader_addr}"), - }); - Ok(Response::new(RaftClientWriteResponse { - response: Some(response_proto), - })) - } - Err(e) => Err(Status::internal(format!("raft error: {e}"))), - } - } - - async fn remove_node( - &self, - request: Request, - ) -> Result, Status> { - let req = request.into_inner(); - let node_id = req.node_id; - - let mut members = std::collections::BTreeSet::new(); - members.insert(node_id); - match self - .meta_raft - .change_membership(openraft::ChangeMembers::RemoveVoters(members), false) - .await - { - Ok(_) => Ok(Response::new(RemoveNodeResponse { - success: true, - error: String::new(), - leader_addr: String::new(), - })), - Err(e) => Ok(Response::new(handle_remove_error(e))), - } - } -} - -/// Apply a forwarded write to the local scheduler after Raft commit. -/// This ensures the leader's scheduler has the data for serving consumers. -/// Shared between the gRPC and binary protocol cluster services. -pub(super) async fn apply_to_scheduler(broker: &Broker, req: &super::types::ClusterRequest) { - match req { - super::types::ClusterRequest::Enqueue { messages } => { - if messages.is_empty() { - return; - } - let msgs = messages.clone(); - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - if let Err(e) = broker.send_command(crate::SchedulerCommand::Enqueue { - messages: msgs, - reply: reply_tx, - }) { - tracing::error!(error = %e, "failed to apply forwarded enqueue to scheduler"); - return; - } - match reply_rx.await { - Err(e) => { - tracing::error!(error = %e, "scheduler dropped reply for forwarded enqueue"); - } - Ok(results) => { - for result in results { - if let Err(e) = result { - tracing::error!(error = %e, "scheduler rejected forwarded enqueue"); - } - } - } - } - } - super::types::ClusterRequest::Ack { items } => { - if items.is_empty() { - return; - } - let ack_items: Vec = items - .iter() - .map(|i| crate::broker::command::AckItem { - queue_id: i.queue_id.clone(), - msg_id: i.msg_id, - }) - .collect(); - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - if let Err(e) = broker.send_command(crate::SchedulerCommand::Ack { - items: ack_items, - reply: reply_tx, - }) { - tracing::error!(error = %e, "failed to apply forwarded ack to scheduler"); - return; - } - match reply_rx.await { - Err(e) => { - tracing::error!(error = %e, "scheduler dropped reply for forwarded ack"); - } - Ok(results) => { - for result in results { - if let Err(e) = result { - tracing::error!(error = %e, "scheduler rejected forwarded ack"); - } - } - } - } - } - super::types::ClusterRequest::Nack { items } => { - if items.is_empty() { - return; - } - let nack_items: Vec = items - .iter() - .map(|i| crate::broker::command::NackItem { - queue_id: i.queue_id.clone(), - msg_id: i.msg_id, - error: i.error.clone(), - }) - .collect(); - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - if let Err(e) = broker.send_command(crate::SchedulerCommand::Nack { - items: nack_items, - reply: reply_tx, - }) { - tracing::error!(error = %e, "failed to apply forwarded nack to scheduler"); - return; - } - match reply_rx.await { - Err(e) => { - tracing::error!(error = %e, "scheduler dropped reply for forwarded nack"); - } - Ok(results) => { - for result in results { - if let Err(e) = result { - tracing::error!(error = %e, "scheduler rejected forwarded nack"); - } - } - } - } - } - // Other request types (CreateQueue, DeleteQueue, etc.) are handled - // through the meta Raft event system, not here. - _ => {} - } -} - -/// Convert a membership change error into an AddNodeResponse with leader hint. -fn handle_membership_error( - error: RaftError>, -) -> AddNodeResponse { - match &error { - RaftError::APIError(write_err) => match write_err { - openraft::error::ClientWriteError::ForwardToLeader(fwd) => { - let leader_addr = fwd - .leader_node - .as_ref() - .map(|n| n.addr.clone()) - .unwrap_or_default(); - AddNodeResponse { - success: false, - error: "not leader".to_string(), - leader_addr, - } - } - openraft::error::ClientWriteError::ChangeMembershipError(e) => AddNodeResponse { - success: false, - error: format!("{e}"), - leader_addr: String::new(), - }, - }, - RaftError::Fatal(e) => AddNodeResponse { - success: false, - error: format!("fatal: {e}"), - leader_addr: String::new(), - }, - } -} - -/// Convert a membership change error into a RemoveNodeResponse with leader hint. -fn handle_remove_error( - error: RaftError>, -) -> RemoveNodeResponse { - match &error { - RaftError::APIError(write_err) => match write_err { - openraft::error::ClientWriteError::ForwardToLeader(fwd) => { - let leader_addr = fwd - .leader_node - .as_ref() - .map(|n| n.addr.clone()) - .unwrap_or_default(); - RemoveNodeResponse { - success: false, - error: "not leader".to_string(), - leader_addr, - } - } - openraft::error::ClientWriteError::ChangeMembershipError(e) => RemoveNodeResponse { - success: false, - error: format!("{e}"), - leader_addr: String::new(), - }, - }, - RaftError::Fatal(e) => RemoveNodeResponse { - success: false, - error: format!("fatal: {e}"), - leader_addr: String::new(), - }, - } -} diff --git a/crates/fila-core/src/cluster/mod.rs b/crates/fila-core/src/cluster/mod.rs index 58f7f5c3..9f8bcc44 100644 --- a/crates/fila-core/src/cluster/mod.rs +++ b/crates/fila-core/src/cluster/mod.rs @@ -1,5 +1,4 @@ pub mod binary_service; -pub mod grpc_service; pub mod multi_raft; pub mod network; pub mod proto_convert; @@ -97,7 +96,7 @@ pub struct ClusterWriteResult { impl ClusterHandle { /// Submit a write to a queue's Raft group. If this node is not the /// leader, the request is transparently forwarded to the leader via - /// the cluster gRPC `ClientWrite` RPC. + /// the cluster binary protocol `ClientWrite` RPC. /// /// Returns a `ClusterWriteResult` indicating whether the write was /// handled locally (caller should apply to scheduler) or forwarded @@ -173,7 +172,7 @@ impl ClusterHandle { Some(leader == Some(self.node_id)) } - /// Get the leader's client-facing gRPC address for a queue. Returns `None` + /// Get the leader's client-facing address for a queue. Returns `None` /// if the queue group doesn't exist, no leader is elected, or the leader's /// client address is unknown. pub async fn get_queue_leader_client_addr(&self, queue_id: &str) -> Option { diff --git a/crates/fila-core/src/cluster/multi_raft.rs b/crates/fila-core/src/cluster/multi_raft.rs index 13f02572..dd23c6df 100644 --- a/crates/fila-core/src/cluster/multi_raft.rs +++ b/crates/fila-core/src/cluster/multi_raft.rs @@ -40,7 +40,7 @@ pub struct MultiRaftManager { /// "queue doesn't exist" (`QueueGroupNotFound`) from "node still catching /// up" (`NodeNotReady`). expected_queues: RwLock>, - /// Mapping of node_id → client-facing gRPC address. Used for leader hint + /// Mapping of node_id → client-facing address. Used for leader hint /// routing: when a consumer connects to a non-leader, the server includes /// the leader's client address in the error response. Populated via /// AddNode requests and GetNodeInfo RPCs. @@ -214,7 +214,7 @@ impl MultiRaftManager { self.expected_queues.read().await.contains(queue_id) } - /// Register a node's client-facing gRPC address for leader hint routing. + /// Register a node's client-facing address for leader hint routing. pub async fn register_client_addr(&self, node_id: NodeId, addr: &str) { self.client_addrs .write() @@ -222,7 +222,7 @@ impl MultiRaftManager { .insert(node_id, addr.to_string()); } - /// Look up a node's client-facing gRPC address. + /// Look up a node's client-facing address. pub async fn get_client_addr(&self, node_id: NodeId) -> Option { self.client_addrs.read().await.get(&node_id).cloned() } diff --git a/crates/fila-core/src/cluster/tests.rs b/crates/fila-core/src/cluster/tests.rs index f63dc709..7d991820 100644 --- a/crates/fila-core/src/cluster/tests.rs +++ b/crates/fila-core/src/cluster/tests.rs @@ -1003,7 +1003,7 @@ mod tests { .collect(); assert_eq!(survivors.len(), 2); - // Kill the leader: shut down its Raft instances and gRPC server. + // Kill the leader: shut down its Raft instances and cluster server. match queue_leader_id { 1 => { node1.multi_raft.shutdown_all().await; diff --git a/crates/fila-core/src/telemetry.rs b/crates/fila-core/src/telemetry.rs index 4644b472..998ed18b 100644 --- a/crates/fila-core/src/telemetry.rs +++ b/crates/fila-core/src/telemetry.rs @@ -136,7 +136,7 @@ fn init_otel_pipeline( opentelemetry::global::set_meter_provider(meter_provider.clone()); - // Set W3C TraceContext propagator for gRPC metadata extraction/injection + // Set W3C TraceContext propagator for trace context extraction/injection opentelemetry::global::set_text_map_propagator( opentelemetry_sdk::propagation::TraceContextPropagator::new(), ); diff --git a/crates/fila-e2e/tests/acl.rs b/crates/fila-e2e/tests/acl.rs index 41ec7c41..113f411d 100644 --- a/crates/fila-e2e/tests/acl.rs +++ b/crates/fila-e2e/tests/acl.rs @@ -284,7 +284,7 @@ async fn admin_operations_require_admin_permission() { "create queue with produce-only key should fail; stdout={} stderr={}", out.stdout, out.stderr ); - // The CLI wraps the gRPC PERMISSION_DENIED status as "Error: key does not have admin permission". + // The CLI wraps the PERMISSION_DENIED error as "Error: key does not have admin permission". assert!( out.stderr.contains("admin permission"), "expected admin permission error, got: {}", diff --git a/crates/fila-e2e/tests/cluster.rs b/crates/fila-e2e/tests/cluster.rs index 01394fda..f89d586b 100644 --- a/crates/fila-e2e/tests/cluster.rs +++ b/crates/fila-e2e/tests/cluster.rs @@ -398,17 +398,42 @@ async fn cluster_consume_leader_redirect() { .await .expect("enqueue should succeed"); - // Connect consumer to NON-LEADER node. The SDK should detect the - // leader hint in the UNAVAILABLE error and transparently reconnect - // to the leader node. + // Connect consumer to NON-LEADER node. The server responds with + // NotLeader error containing the leader address. The client sees + // this as an error item on the stream and reconnects to the leader. let consumer_client = helpers::sdk_client(cluster.binary_addr(non_leader)).await; let mut stream = consumer_client .consume("redirect-q") .await - .expect("consume should succeed after leader redirect"); + .expect("consume call should succeed"); - // We should receive the message despite connecting to a non-leader. - let received = tokio::time::timeout(Duration::from_secs(10), stream.next()) + // First item should be a NotLeader error. Extract leader addr and reconnect. + let first = tokio::time::timeout(Duration::from_secs(5), stream.next()) + .await + .expect("should get response within timeout") + .expect("stream should not be empty"); + match first { + Err(ref e) if format!("{e}").contains("NotLeader") => { + // Expected — reconnect to leader. + } + Ok(msg) => { + // Server might have become leader by the time we connected. + assert_eq!(msg.id, msg_id); + assert_eq!(msg.payload, b"redirect-test"); + return; // Test passes — message received directly. + } + Err(e) => panic!("unexpected error: {e}"), + } + drop(stream); + + // Reconnect to leader and consume. + let leader_client = helpers::sdk_client(cluster.binary_addr(leader)).await; + let mut leader_stream = leader_client + .consume("redirect-q") + .await + .expect("consume on leader should succeed"); + + let received = tokio::time::timeout(Duration::from_secs(10), leader_stream.next()) .await .expect("should receive message within timeout") .expect("stream should not be empty") diff --git a/crates/fila-e2e/tests/crash_recovery.rs b/crates/fila-e2e/tests/crash_recovery.rs index d29f2c80..45754115 100644 --- a/crates/fila-e2e/tests/crash_recovery.rs +++ b/crates/fila-e2e/tests/crash_recovery.rs @@ -24,10 +24,10 @@ async fn e2e_crash_recovery() { } // Kill the server (simulates crash — SIGKILL, not graceful) - let (data_dir, grpc_port, binary_port) = server.kill_and_take_data(); + let (data_dir, port) = server.kill_and_take_data(); - // Restart on the same data directory and ports - let server2 = helpers::TestServer::restart_on(data_dir, grpc_port, binary_port); + // Restart on the same data directory and port + let server2 = helpers::TestServer::restart_on(data_dir, port); let client2 = helpers::sdk_client(server2.binary_addr()).await; // Consume from the queue — all messages should be available diff --git a/crates/fila-e2e/tests/dlq_redrive.rs b/crates/fila-e2e/tests/dlq_redrive.rs index 9fe08475..d96a0a75 100644 --- a/crates/fila-e2e/tests/dlq_redrive.rs +++ b/crates/fila-e2e/tests/dlq_redrive.rs @@ -20,16 +20,18 @@ async fn e2e_dlq_redrive() { None, ); - let client = helpers::sdk_client(server.binary_addr()).await; + let enqueue_client = helpers::sdk_client(server.binary_addr()).await; // Enqueue a message - let msg_id = client + let msg_id = enqueue_client .enqueue("redrive-src", HashMap::new(), b"redrive-me".to_vec()) .await .unwrap(); // Nack until DLQ (attempts: nack→on_failure(1)→retry, nack→on_failure(2)→DLQ) - let mut stream = client.consume("redrive-src").await.unwrap(); + // Use a dedicated client per consume to avoid stale consumer state. + let consume_client1 = helpers::sdk_client(server.binary_addr()).await; + let mut stream = consume_client1.consume("redrive-src").await.unwrap(); // First delivery: attempt_count=0, nack → on_failure gets attempts=1 → retry let msg = tokio::time::timeout(Duration::from_secs(5), stream.next()) @@ -38,7 +40,7 @@ async fn e2e_dlq_redrive() { .unwrap() .unwrap(); assert_eq!(msg.id, msg_id); - client + consume_client1 .nack("redrive-src", &msg_id, "failing") .await .unwrap(); @@ -50,20 +52,21 @@ async fn e2e_dlq_redrive() { .unwrap() .unwrap(); assert_eq!(msg.id, msg_id); - client + consume_client1 .nack("redrive-src", &msg_id, "failing again") .await .unwrap(); - // Drop the source stream so the scheduler knows the consumer is gone. - // This prevents the redriven message from being delivered to the old stream. + // Drop consumer — closes TCP connection which unregisters on server side. drop(stream); + drop(consume_client1); - // Give the scheduler a moment to route to DLQ and unregister the consumer + // Give the scheduler a moment to route to DLQ and detect disconnected consumer tokio::time::sleep(Duration::from_millis(500)).await; // Verify DLQ has the message by consuming from it - let mut dlq_stream = client.consume("redrive-src.dlq").await.unwrap(); + let dlq_client = helpers::sdk_client(server.binary_addr()).await; + let mut dlq_stream = dlq_client.consume("redrive-src.dlq").await.unwrap(); let dlq_msg = tokio::time::timeout(Duration::from_secs(5), dlq_stream.next()) .await .expect("timeout waiting for DLQ message") @@ -72,8 +75,12 @@ async fn e2e_dlq_redrive() { assert_eq!(dlq_msg.id, msg_id); // Ack the DLQ message so it's no longer in-flight (redrive only moves pending messages) - client.ack("redrive-src.dlq", &dlq_msg.id).await.unwrap(); + dlq_client + .ack("redrive-src.dlq", &dlq_msg.id) + .await + .unwrap(); drop(dlq_stream); + drop(dlq_client); // Re-enqueue to DLQ for redrive (the original message was acked from DLQ above, // so we need a fresh message in the DLQ to redrive) @@ -81,12 +88,14 @@ async fn e2e_dlq_redrive() { // The simplest approach: re-do the whole flow with a fresh message. // Enqueue a new message and drive it to DLQ - let msg_id2 = client + let msg_id2 = enqueue_client .enqueue("redrive-src", HashMap::new(), b"redrive-me-2".to_vec()) .await .unwrap(); - let mut stream2 = client.consume("redrive-src").await.unwrap(); + // Use a fresh client to avoid stale consumer state from the first consume. + let client2 = helpers::sdk_client(server.binary_addr()).await; + let mut stream2 = client2.consume("redrive-src").await.unwrap(); for _ in 0..2 { let msg = tokio::time::timeout(Duration::from_secs(5), stream2.next()) .await @@ -94,12 +103,13 @@ async fn e2e_dlq_redrive() { .unwrap() .unwrap(); assert_eq!(msg.id, msg_id2); - client + client2 .nack("redrive-src", &msg_id2, "failing") .await .unwrap(); } drop(stream2); + drop(client2); tokio::time::sleep(Duration::from_millis(500)).await; // Now redrive via CLI — the message is pending (not in-flight) in the DLQ @@ -112,7 +122,8 @@ async fn e2e_dlq_redrive() { ); // Consume from source queue — message should be available with reset attempt count - let mut stream3 = client.consume("redrive-src").await.unwrap(); + let consume_client3 = helpers::sdk_client(server.binary_addr()).await; + let mut stream3 = consume_client3.consume("redrive-src").await.unwrap(); let msg = tokio::time::timeout(Duration::from_secs(5), stream3.next()) .await .expect("timeout waiting for redriven message") @@ -127,5 +138,5 @@ async fn e2e_dlq_redrive() { assert_eq!(msg.payload, b"redrive-me-2"); // Clean up - client.ack("redrive-src", &msg.id).await.unwrap(); + consume_client3.ack("redrive-src", &msg.id).await.unwrap(); } diff --git a/crates/fila-e2e/tests/helpers/cluster.rs b/crates/fila-e2e/tests/helpers/cluster.rs index 7a2e10c6..0d4f78b8 100644 --- a/crates/fila-e2e/tests/helpers/cluster.rs +++ b/crates/fila-e2e/tests/helpers/cluster.rs @@ -11,19 +11,17 @@ use std::path::PathBuf; /// A running 3-node fila-server cluster for e2e testing. /// /// Spawns multiple `fila-server` processes with cluster configuration. -/// Each node has distinct client (gRPC), binary protocol, and cluster ports. +/// Each node has distinct client (binary protocol) and cluster ports. /// Node 0 bootstraps the cluster (`peers = []`); the rest join via node 0's cluster port. pub struct TestCluster { nodes: Vec>, client_ports: Vec, - binary_ports: Vec, cluster_ports: Vec, } struct ClusterNode { child: Child, addr: String, - binary_addr: String, data_dir: tempfile::TempDir, } @@ -33,13 +31,12 @@ impl TestCluster { assert!(n >= 1, "cluster must have at least 1 node"); let client_ports: Vec = (0..n).map(|_| free_port()).collect(); - let binary_ports: Vec = (0..n).map(|_| free_port()).collect(); let cluster_ports: Vec = (0..n).map(|_| free_port()).collect(); let mut nodes = Vec::with_capacity(n); // Start node 0 first (bootstrap). Give it a moment to initialize Raft. - let node0 = start_cluster_node(0, client_ports[0], binary_ports[0], cluster_ports[0], &[]); + let node0 = start_cluster_node(0, client_ports[0], cluster_ports[0], &[]); nodes.push(Some(node0)); // Brief delay to let node 0 bootstrap its single-node Raft cluster @@ -48,13 +45,7 @@ impl TestCluster { // Start remaining nodes that join via node 0 let seed = format!("127.0.0.1:{}", cluster_ports[0]); for i in 1..n { - let node = start_cluster_node( - i, - client_ports[i], - binary_ports[i], - cluster_ports[i], - &[&seed], - ); + let node = start_cluster_node(i, client_ports[i], cluster_ports[i], &[&seed]); nodes.push(Some(node)); } @@ -64,27 +55,23 @@ impl TestCluster { TestCluster { nodes, client_ports, - binary_ports, cluster_ports, } } - /// Get the HTTP address of node `i` (e.g., "http://127.0.0.1:12345"). - /// Used by gRPC clients. + /// Get the binary protocol address of node `i` (e.g., "127.0.0.1:12345"). + /// Used by SDK and CLI connections. pub fn addr(&self, i: usize) -> &str { &self.nodes[i].as_ref().expect("node not running").addr } - /// Get the binary protocol address of node `i` (e.g., "127.0.0.1:12346"). - /// Used by SDK connections. + /// Get the binary protocol address of node `i`. + /// Alias for `addr()` for backward compatibility. pub fn binary_addr(&self, i: usize) -> &str { - &self.nodes[i] - .as_ref() - .expect("node not running") - .binary_addr + self.addr(i) } - /// Get the raw host:port of node `i` (gRPC port). + /// Get the raw host:port of node `i`. pub fn host_port(&self, i: usize) -> String { format!("127.0.0.1:{}", self.client_ports[i]) } @@ -99,7 +86,6 @@ impl TestCluster { self.nodes[i] = Some(ClusterNode { child: node.child, addr: node.addr, - binary_addr: node.binary_addr, data_dir: node.data_dir, }); } @@ -122,7 +108,6 @@ impl TestCluster { let node = restart_cluster_node( i, self.client_ports[i], - self.binary_ports[i], self.cluster_ports[i], &seed_refs, data_dir, @@ -148,53 +133,35 @@ impl Drop for TestCluster { fn start_cluster_node( index: usize, client_port: u16, - binary_port: u16, cluster_port: u16, seeds: &[&str], ) -> ClusterNode { let data_dir = tempfile::tempdir().expect("create temp dir"); - write_cluster_config( - &data_dir, - index, - client_port, - binary_port, - cluster_port, - seeds, - ); - spawn_and_wait(data_dir, client_port, binary_port) + write_cluster_config(&data_dir, index, client_port, cluster_port, seeds); + spawn_and_wait(data_dir, client_port) } fn restart_cluster_node( index: usize, client_port: u16, - binary_port: u16, cluster_port: u16, seeds: &[&str], data_dir: tempfile::TempDir, ) -> ClusterNode { // Rewrite config (it's already there but let's be safe) - write_cluster_config( - &data_dir, - index, - client_port, - binary_port, - cluster_port, - seeds, - ); - spawn_and_wait(data_dir, client_port, binary_port) + write_cluster_config(&data_dir, index, client_port, cluster_port, seeds); + spawn_and_wait(data_dir, client_port) } fn write_cluster_config( data_dir: &tempfile::TempDir, index: usize, client_port: u16, - binary_port: u16, cluster_port: u16, seeds: &[&str], ) { let node_id = index as u64 + 1; // node IDs are 1-based let addr = format!("127.0.0.1:{client_port}"); - let binary_addr = format!("127.0.0.1:{binary_port}"); let bind_addr = format!("127.0.0.1:{cluster_port}"); let peers_toml = if seeds.is_empty() { @@ -207,7 +174,6 @@ fn write_cluster_config( let config = format!( r#"[server] listen_addr = "{addr}" -binary_addr = "{binary_addr}" [telemetry] otlp_endpoint = "" @@ -226,9 +192,8 @@ heartbeat_interval_ms = 150 std::fs::write(&config_path, config).expect("write cluster config"); } -fn spawn_and_wait(data_dir: tempfile::TempDir, client_port: u16, binary_port: u16) -> ClusterNode { - let grpc_addr = format!("127.0.0.1:{client_port}"); - let binary_addr = format!("127.0.0.1:{binary_port}"); +fn spawn_and_wait(data_dir: tempfile::TempDir, client_port: u16) -> ClusterNode { + let addr = format!("127.0.0.1:{client_port}"); let binary = server_binary(); assert!( binary.exists(), @@ -250,31 +215,22 @@ fn spawn_and_wait(data_dir: tempfile::TempDir, client_port: u16, binary_port: u1 let stderr = child.stderr.take().expect("stderr"); std::thread::spawn(move || for _ in BufReader::new(stderr).lines() {}); - // Poll TCP until both ports are reachable + // Poll TCP until port is reachable let start = std::time::Instant::now(); - let mut grpc_ok = false; - let mut binary_ok = false; while start.elapsed() < Duration::from_secs(15) { - if !grpc_ok && std::net::TcpStream::connect(&grpc_addr).is_ok() { - grpc_ok = true; - } - if !binary_ok && std::net::TcpStream::connect(&binary_addr).is_ok() { - binary_ok = true; - } - if grpc_ok && binary_ok { + if std::net::TcpStream::connect(&addr).is_ok() { break; } std::thread::sleep(Duration::from_millis(50)); } assert!( - grpc_ok && binary_ok, - "cluster node did not become reachable at gRPC={grpc_addr} binary={binary_addr} within 15s" + std::net::TcpStream::connect(&addr).is_ok(), + "cluster node did not become reachable at {addr} within 15s" ); ClusterNode { child, - addr: format!("http://{grpc_addr}"), - binary_addr, + addr, data_dir, } } diff --git a/crates/fila-e2e/tests/helpers/mod.rs b/crates/fila-e2e/tests/helpers/mod.rs index 058b6f8f..822f3ee8 100644 --- a/crates/fila-e2e/tests/helpers/mod.rs +++ b/crates/fila-e2e/tests/helpers/mod.rs @@ -14,9 +14,8 @@ use std::time::Duration; /// The server is killed when this struct is dropped. pub struct TestServer { child: Option, - addr: String, /// Binary protocol address (host:port, no scheme prefix). - binary_addr: String, + addr: String, /// Kept alive for the duration of the test. When dropped, the temp dir is cleaned up. /// `None` after `kill_and_take_data()` transfers ownership. data_dir: Option, @@ -37,10 +36,8 @@ impl TestServer { addr: String, data_dir: tempfile::TempDir, ) -> Self { - // When constructing from parts (e.g. TLS tests), the binary_addr defaults - // to the same as addr but without scheme, since those tests manage their - // own config. The SDK connects via binary_addr. - let binary_addr = addr + // Strip any scheme prefix — the binary protocol uses raw host:port. + let addr = addr .strip_prefix("http://") .or_else(|| addr.strip_prefix("https://")) .unwrap_or(&addr) @@ -48,7 +45,6 @@ impl TestServer { Self { child: Some(child), addr, - binary_addr, data_dir: Some(data_dir), } } @@ -67,13 +63,8 @@ impl TestServer { /// Start a new fila-server instance with custom options. fn start_with_options(opts: TestServerOptions) -> Self { - let grpc_port = free_port(); - let mut binary_port = free_port(); - while binary_port == grpc_port { - binary_port = free_port(); - } - let grpc_addr = format!("127.0.0.1:{grpc_port}"); - let binary_addr = format!("127.0.0.1:{binary_port}"); + let port = free_port(); + let addr = format!("127.0.0.1:{port}"); let data_dir = tempfile::tempdir().expect("create temp dir"); let config_path = data_dir.path().join("fila.toml"); @@ -84,8 +75,7 @@ impl TestServer { }; let config_content = format!( r#"[server] -listen_addr = "{grpc_addr}" -binary_addr = "{binary_addr}" +listen_addr = "{addr}" {scheduler_section} [telemetry] otlp_endpoint = "" @@ -116,86 +106,61 @@ otlp_endpoint = "" let stderr = child.stderr.take().expect("stderr"); std::thread::spawn(move || for _ in BufReader::new(stderr).lines() {}); - // Poll TCP until both the gRPC and binary protocol ports are reachable. + // Poll TCP until the port is reachable. let start = std::time::Instant::now(); - let mut grpc_connected = false; - let mut binary_connected = false; while start.elapsed() < Duration::from_secs(10) { - if !grpc_connected && std::net::TcpStream::connect(&grpc_addr).is_ok() { - grpc_connected = true; - } - if !binary_connected && std::net::TcpStream::connect(&binary_addr).is_ok() { - binary_connected = true; - } - if grpc_connected && binary_connected { + if std::net::TcpStream::connect(&addr).is_ok() { break; } std::thread::sleep(Duration::from_millis(50)); } assert!( - grpc_connected && binary_connected, - "fila-server did not become reachable at gRPC={grpc_addr} binary={binary_addr} within 10s" + std::net::TcpStream::connect(&addr).is_ok(), + "fila-server did not become reachable at {addr} within 10s" ); Self { child: Some(child), - addr: format!("http://{grpc_addr}"), - binary_addr, + addr, data_dir: Some(data_dir), } } - /// The HTTP address of the running server (e.g., "http://127.0.0.1:12345"). - /// Used by gRPC clients. + /// The binary protocol address (host:port) for SDK and CLI connections. pub fn addr(&self) -> &str { &self.addr } /// The binary protocol address (host:port) for SDK connections. + /// Alias for `addr()` for backward compatibility. pub fn binary_addr(&self) -> &str { - &self.binary_addr + &self.addr } - /// The raw host:port address (without http:// prefix) for the gRPC port. + /// The raw host:port address (without any prefix). pub fn host_port(&self) -> &str { - self.addr.strip_prefix("http://").unwrap_or(&self.addr) + &self.addr } /// Kill the server and return the data directory for restarting on the same data. /// This simulates a crash — the server is killed with SIGKILL. - pub fn kill_and_take_data(mut self) -> (tempfile::TempDir, u16, u16) { + pub fn kill_and_take_data(mut self) -> (tempfile::TempDir, u16) { if let Some(mut child) = self.child.take() { let _ = child.kill(); let _ = child.wait(); } - let grpc_port = self.grpc_port(); - let binary_port = self.binary_port(); + let port = self.port(); let data_dir = self.data_dir.take().expect("data_dir already taken"); - (data_dir, grpc_port, binary_port) + (data_dir, port) } - fn grpc_port(&self) -> u16 { - self.host_port() - .split(':') - .next_back() - .unwrap() - .parse() - .unwrap() + fn port(&self) -> u16 { + self.addr.split(':').next_back().unwrap().parse().unwrap() } - fn binary_port(&self) -> u16 { - self.binary_addr - .split(':') - .next_back() - .unwrap() - .parse() - .unwrap() - } - - /// Restart a server on the same data directory and ports. - pub fn restart_on(data_dir: tempfile::TempDir, grpc_port: u16, binary_port: u16) -> Self { - let grpc_addr = format!("127.0.0.1:{grpc_port}"); - let binary_addr = format!("127.0.0.1:{binary_port}"); + /// Restart a server on the same data directory and port. + pub fn restart_on(data_dir: tempfile::TempDir, port: u16) -> Self { + let addr = format!("127.0.0.1:{port}"); let binary = server_binary(); assert!( @@ -221,29 +186,20 @@ otlp_endpoint = "" std::thread::spawn(move || for _ in BufReader::new(stderr).lines() {}); let start = std::time::Instant::now(); - let mut grpc_connected = false; - let mut binary_connected = false; while start.elapsed() < Duration::from_secs(10) { - if !grpc_connected && std::net::TcpStream::connect(&grpc_addr).is_ok() { - grpc_connected = true; - } - if !binary_connected && std::net::TcpStream::connect(&binary_addr).is_ok() { - binary_connected = true; - } - if grpc_connected && binary_connected { + if std::net::TcpStream::connect(&addr).is_ok() { break; } std::thread::sleep(Duration::from_millis(50)); } assert!( - grpc_connected && binary_connected, - "fila-server did not become reachable at gRPC={grpc_addr} binary={binary_addr} within 10s after restart" + std::net::TcpStream::connect(&addr).is_ok(), + "fila-server did not become reachable at {addr} within 10s after restart" ); Self { child: Some(child), - addr: format!("http://{grpc_addr}"), - binary_addr, + addr, data_dir: Some(data_dir), } } @@ -337,16 +293,14 @@ pub const TEST_BOOTSTRAP_KEY: &str = "test-bootstrap-key-for-e2e"; /// Start a fila-server with API key authentication enabled. /// -/// Returns (TestServer, http_addr). Use `TEST_BOOTSTRAP_KEY` as the initial credential. +/// Returns (TestServer, addr). Use `TEST_BOOTSTRAP_KEY` as the initial credential. pub fn start_auth_server() -> (TestServer, String) { - let grpc_port = free_port(); - let binary_port = free_port(); - let grpc_addr = format!("127.0.0.1:{grpc_port}"); - let binary_addr = format!("127.0.0.1:{binary_port}"); + let port = free_port(); + let addr = format!("127.0.0.1:{port}"); let data_dir = tempfile::tempdir().expect("temp dir"); let config_content = format!( - "[server]\nlisten_addr = \"{grpc_addr}\"\nbinary_addr = \"{binary_addr}\"\n\n[telemetry]\notlp_endpoint = \"\"\n\n[auth]\nbootstrap_apikey = \"{TEST_BOOTSTRAP_KEY}\"\n" + "[server]\nlisten_addr = \"{addr}\"\n\n[telemetry]\notlp_endpoint = \"\"\n\n[auth]\nbootstrap_apikey = \"{TEST_BOOTSTRAP_KEY}\"\n" ); let config_path = data_dir.path().join("fila.toml"); std::fs::write(&config_path, config_content).expect("write config"); @@ -376,33 +330,23 @@ pub fn start_auth_server() -> (TestServer, String) { // Poll TCP until the server is reachable. let start = std::time::Instant::now(); - let mut grpc_connected = false; - let mut binary_connected = false; while start.elapsed() < Duration::from_secs(10) { - if !grpc_connected && std::net::TcpStream::connect(&grpc_addr).is_ok() { - grpc_connected = true; - } - if !binary_connected && std::net::TcpStream::connect(&binary_addr).is_ok() { - binary_connected = true; - } - if grpc_connected && binary_connected { + if std::net::TcpStream::connect(&addr).is_ok() { break; } std::thread::sleep(Duration::from_millis(50)); } assert!( - grpc_connected && binary_connected, - "fila-server (auth mode) did not become reachable at gRPC={grpc_addr} binary={binary_addr} within 10s" + std::net::TcpStream::connect(&addr).is_ok(), + "fila-server (auth mode) did not become reachable at {addr} within 10s" ); - let http_addr = format!("http://{grpc_addr}"); let server = TestServer { child: Some(child), - addr: http_addr.clone(), - binary_addr, + addr: addr.clone(), data_dir: Some(data_dir), }; - (server, http_addr) + (server, addr) } /// Create a superadmin API key via CLI and return (key_id, token). diff --git a/crates/fila-e2e/tests/tls.rs b/crates/fila-e2e/tests/tls.rs index e4542ed6..e264947b 100644 --- a/crates/fila-e2e/tests/tls.rs +++ b/crates/fila-e2e/tests/tls.rs @@ -19,23 +19,18 @@ fn generate_self_signed_cert() -> (Vec, Vec) { (cert_pem, key_pem) } -/// Start fila-server with TLS enabled (both gRPC and binary protocol listeners). +/// Start fila-server with TLS enabled. /// -/// Returns (TestServer, binary_addr, cert_pem) where `cert_pem` is both the +/// Returns (TestServer, addr, cert_pem) where `cert_pem` is both the /// server certificate and the CA cert to use for client verification. fn start_tls_server() -> (TestServer, String, Vec) { use std::net::TcpListener; - let grpc_port = { + let port = { let l = TcpListener::bind("127.0.0.1:0").expect("bind"); l.local_addr().unwrap().port() }; - let binary_port = { - let l = TcpListener::bind("127.0.0.1:0").expect("bind"); - l.local_addr().unwrap().port() - }; - let grpc_addr = format!("127.0.0.1:{grpc_port}"); - let binary_addr = format!("127.0.0.1:{binary_port}"); + let addr = format!("127.0.0.1:{port}"); let (cert_pem, key_pem) = generate_self_signed_cert(); @@ -46,7 +41,7 @@ fn start_tls_server() -> (TestServer, String, Vec) { std::fs::write(&key_path, &key_pem).expect("write key"); let config_content = format!( - "[server]\nlisten_addr = \"{grpc_addr}\"\nbinary_addr = \"{binary_addr}\"\n\n[telemetry]\notlp_endpoint = \"\"\n\n[tls]\ncert_file = \"{cert}\"\nkey_file = \"{key}\"\n", + "[server]\nlisten_addr = \"{addr}\"\n\n[telemetry]\notlp_endpoint = \"\"\n\n[tls]\ncert_file = \"{cert}\"\nkey_file = \"{key}\"\n", cert = cert_path.to_str().unwrap(), key = key_path.to_str().unwrap(), ); @@ -84,40 +79,31 @@ fn start_tls_server() -> (TestServer, String, Vec) { let stderr = child.stderr.take().expect("stderr"); std::thread::spawn(move || for _ in std::io::BufReader::new(stderr).lines() {}); - // Poll TCP until both listeners are reachable. + // Poll TCP until the listener is reachable. let start = std::time::Instant::now(); - let mut grpc_ok = false; - let mut binary_ok = false; while start.elapsed() < std::time::Duration::from_secs(10) { - if !grpc_ok && std::net::TcpStream::connect(&grpc_addr).is_ok() { - grpc_ok = true; - } - if !binary_ok && std::net::TcpStream::connect(&binary_addr).is_ok() { - binary_ok = true; - } - if grpc_ok && binary_ok { + if std::net::TcpStream::connect(&addr).is_ok() { break; } std::thread::sleep(std::time::Duration::from_millis(50)); } assert!( - grpc_ok && binary_ok, - "TLS fila-server did not become reachable within 10s (grpc={grpc_ok}, binary={binary_ok})" + std::net::TcpStream::connect(&addr).is_ok(), + "TLS fila-server did not become reachable within 10s" ); - let https_addr = format!("https://{grpc_addr}"); - let server = TestServer::from_parts(child, https_addr, data_dir); - (server, binary_addr, cert_pem) + let server = TestServer::from_parts(child, addr.clone(), data_dir); + (server, addr, cert_pem) } #[tokio::test] async fn tls_valid_cert_connects_successfully() { - let (_server, binary_addr, ca_pem) = start_tls_server(); + let (_server, addr, ca_pem) = start_tls_server(); // The cert is self-signed, so using the cert itself as the CA cert lets // the client verify the server's identity. let result = fila_sdk::FilaClient::connect_with_options( - fila_sdk::ConnectOptions::new(&binary_addr).with_tls_ca_cert(ca_pem), + fila_sdk::ConnectOptions::new(&addr).with_tls_ca_cert(ca_pem), ) .await; @@ -129,10 +115,10 @@ async fn tls_valid_cert_connects_successfully() { #[tokio::test] async fn tls_plaintext_client_is_rejected() { - let (_server, binary_addr, _ca_pem) = start_tls_server(); + let (_server, addr, _ca_pem) = start_tls_server(); // Connect without TLS — the server speaks TLS, so the handshake must fail. - let connect_result = fila_sdk::FilaClient::connect(&binary_addr).await; + let connect_result = fila_sdk::FilaClient::connect(&addr).await; // With the binary protocol, a plaintext connection to a TLS server should // fail during the protocol handshake (the server expects a TLS ClientHello). @@ -144,13 +130,13 @@ async fn tls_plaintext_client_is_rejected() { #[tokio::test] async fn tls_wrong_ca_cert_is_rejected() { - let (_server, binary_addr, _correct_ca_pem) = start_tls_server(); + let (_server, addr, _correct_ca_pem) = start_tls_server(); // Generate an entirely different cert/key pair; its cert is a different CA. let (wrong_ca_pem, _) = generate_self_signed_cert(); let connect_result = fila_sdk::FilaClient::connect_with_options( - fila_sdk::ConnectOptions::new(&binary_addr).with_tls_ca_cert(wrong_ca_pem), + fila_sdk::ConnectOptions::new(&addr).with_tls_ca_cert(wrong_ca_pem), ) .await; diff --git a/crates/fila-e2e/tests/tls_mtls.rs b/crates/fila-e2e/tests/tls_mtls.rs index a9cbcaf8..0f5dad16 100644 --- a/crates/fila-e2e/tests/tls_mtls.rs +++ b/crates/fila-e2e/tests/tls_mtls.rs @@ -66,7 +66,7 @@ fn generate_mtls_certs() -> (Vec, Vec, Vec, Vec, Vec) { /// Start fila-server with mTLS enabled (server cert + client CA verification). /// -/// Returns (TestServer, binary_addr_string). +/// Returns (TestServer, addr). fn start_mtls_server( ca_pem: &[u8], server_cert_pem: &[u8], @@ -74,16 +74,11 @@ fn start_mtls_server( ) -> (helpers::TestServer, String) { use std::net::TcpListener; - let grpc_port = { + let port = { let l = TcpListener::bind("127.0.0.1:0").expect("bind"); l.local_addr().unwrap().port() }; - let binary_port = { - let l = TcpListener::bind("127.0.0.1:0").expect("bind"); - l.local_addr().unwrap().port() - }; - let grpc_addr = format!("127.0.0.1:{grpc_port}"); - let binary_addr = format!("127.0.0.1:{binary_port}"); + let addr = format!("127.0.0.1:{port}"); let data_dir = tempfile::tempdir().expect("temp dir"); let cert_path = data_dir.path().join("server.crt"); @@ -94,7 +89,7 @@ fn start_mtls_server( std::fs::write(&ca_path, ca_pem).expect("write ca"); let config_content = format!( - "[server]\nlisten_addr = \"{grpc_addr}\"\nbinary_addr = \"{binary_addr}\"\n\n[telemetry]\notlp_endpoint = \"\"\n\n[tls]\ncert_file = \"{cert}\"\nkey_file = \"{key}\"\nca_file = \"{ca}\"\n", + "[server]\nlisten_addr = \"{addr}\"\n\n[telemetry]\notlp_endpoint = \"\"\n\n[tls]\ncert_file = \"{cert}\"\nkey_file = \"{key}\"\nca_file = \"{ca}\"\n", cert = cert_path.to_str().unwrap(), key = key_path.to_str().unwrap(), ca = ca_path.to_str().unwrap(), @@ -134,28 +129,19 @@ fn start_mtls_server( std::thread::spawn(move || for _ in std::io::BufReader::new(stderr).lines() {}); let start = std::time::Instant::now(); - let mut grpc_ok = false; - let mut binary_ok = false; while start.elapsed() < Duration::from_secs(10) { - if !grpc_ok && std::net::TcpStream::connect(&grpc_addr).is_ok() { - grpc_ok = true; - } - if !binary_ok && std::net::TcpStream::connect(&binary_addr).is_ok() { - binary_ok = true; - } - if grpc_ok && binary_ok { + if std::net::TcpStream::connect(&addr).is_ok() { break; } std::thread::sleep(Duration::from_millis(50)); } assert!( - grpc_ok && binary_ok, + std::net::TcpStream::connect(&addr).is_ok(), "mTLS fila-server did not become reachable within 10s" ); - let https_addr = format!("https://{grpc_addr}"); - let server = helpers::TestServer::from_parts(child, https_addr, data_dir); - (server, binary_addr) + let server = helpers::TestServer::from_parts(child, addr.clone(), data_dir); + (server, addr) } /// AC 1: mTLS with valid client certificate → connection succeeds, enqueue works. @@ -164,10 +150,10 @@ async fn mtls_valid_client_cert_succeeds() { let (ca_pem, server_cert_pem, server_key_pem, client_cert_pem, client_key_pem) = generate_mtls_certs(); - let (_server, binary_addr) = start_mtls_server(&ca_pem, &server_cert_pem, &server_key_pem); + let (_server, addr) = start_mtls_server(&ca_pem, &server_cert_pem, &server_key_pem); let client = fila_sdk::FilaClient::connect_with_options( - fila_sdk::ConnectOptions::new(&binary_addr) + fila_sdk::ConnectOptions::new(&addr) .with_tls_ca_cert(ca_pem.clone()) .with_tls_identity(client_cert_pem, client_key_pem), ) @@ -200,13 +186,13 @@ async fn mtls_valid_client_cert_succeeds() { async fn mtls_no_client_cert_rejected() { let (ca_pem, server_cert_pem, server_key_pem, _, _) = generate_mtls_certs(); - let (_server, binary_addr) = start_mtls_server(&ca_pem, &server_cert_pem, &server_key_pem); + let (_server, addr) = start_mtls_server(&ca_pem, &server_cert_pem, &server_key_pem); // Connect WITH CA cert (server verification) but WITHOUT client cert. // With the binary protocol, the TLS handshake is eager, so connect() itself // should fail when the server requires client certificates. let connect_result = fila_sdk::FilaClient::connect_with_options( - fila_sdk::ConnectOptions::new(&binary_addr).with_tls_ca_cert(ca_pem), + fila_sdk::ConnectOptions::new(&addr).with_tls_ca_cert(ca_pem), ) .await; @@ -241,16 +227,11 @@ async fn tls_expired_cert_rejected() { // Start server with the expired cert (no ca_file — one-way TLS, not mTLS) use std::net::TcpListener; - let grpc_port = { - let l = TcpListener::bind("127.0.0.1:0").expect("bind"); - l.local_addr().unwrap().port() - }; - let binary_port = { + let port = { let l = TcpListener::bind("127.0.0.1:0").expect("bind"); l.local_addr().unwrap().port() }; - let grpc_addr = format!("127.0.0.1:{grpc_port}"); - let binary_addr = format!("127.0.0.1:{binary_port}"); + let addr = format!("127.0.0.1:{port}"); let data_dir = tempfile::tempdir().expect("temp dir"); let cert_path = data_dir.path().join("server.crt"); let key_path = data_dir.path().join("server.key"); @@ -258,7 +239,7 @@ async fn tls_expired_cert_rejected() { std::fs::write(&key_path, &server_key_pem).expect("write key"); let config_content = format!( - "[server]\nlisten_addr = \"{grpc_addr}\"\nbinary_addr = \"{binary_addr}\"\n\n[telemetry]\notlp_endpoint = \"\"\n\n[tls]\ncert_file = \"{cert}\"\nkey_file = \"{key}\"\n", + "[server]\nlisten_addr = \"{addr}\"\n\n[telemetry]\notlp_endpoint = \"\"\n\n[tls]\ncert_file = \"{cert}\"\nkey_file = \"{key}\"\n", cert = cert_path.to_str().unwrap(), key = key_path.to_str().unwrap(), ); @@ -293,7 +274,7 @@ async fn tls_expired_cert_rejected() { let start = std::time::Instant::now(); let mut connected = false; while start.elapsed() < Duration::from_secs(10) { - if std::net::TcpStream::connect(&binary_addr).is_ok() { + if std::net::TcpStream::connect(&addr).is_ok() { connected = true; break; } @@ -301,14 +282,14 @@ async fn tls_expired_cert_rejected() { } assert!( connected, - "fila-server with expired cert did not become reachable at {binary_addr} within 10s" + "fila-server with expired cert did not become reachable at {addr} within 10s" ); - let _server = helpers::TestServer::from_parts(child, format!("https://{grpc_addr}"), data_dir); + let _server = helpers::TestServer::from_parts(child, addr.clone(), data_dir); // Client should reject expired cert during TLS handshake. let connect_result = fila_sdk::FilaClient::connect_with_options( - fila_sdk::ConnectOptions::new(&binary_addr).with_tls_ca_cert(ca_pem), + fila_sdk::ConnectOptions::new(&addr).with_tls_ca_cert(ca_pem), ) .await; diff --git a/crates/fila-proto/Cargo.toml b/crates/fila-proto/Cargo.toml index c209a3bf..1c6c14c8 100644 --- a/crates/fila-proto/Cargo.toml +++ b/crates/fila-proto/Cargo.toml @@ -5,17 +5,15 @@ edition.workspace = true license.workspace = true repository.workspace = true rust-version.workspace = true -description = "Generated protobuf and gRPC types for Fila" +description = "Generated protobuf types for Fila (Raft log serialization)" homepage.workspace = true -keywords = ["message-broker", "queue", "grpc", "protobuf"] +keywords = ["message-broker", "queue", "protobuf"] categories = ["network-programming"] include = ["src/**/*", "proto/**/*", "build.rs", "Cargo.toml", "LICENSE"] [dependencies] -tonic.workspace = true -tonic-prost.workspace = true prost.workspace = true prost-types.workspace = true [build-dependencies] -tonic-prost-build.workspace = true +prost-build.workspace = true diff --git a/crates/fila-proto/build.rs b/crates/fila-proto/build.rs index 8891dee8..ee01f09c 100644 --- a/crates/fila-proto/build.rs +++ b/crates/fila-proto/build.rs @@ -6,8 +6,6 @@ fn main() -> Result<(), Box> { let proto_files = [ proto_root.join("fila/v1/messages.proto"), - proto_root.join("fila/v1/service.proto"), - proto_root.join("fila/v1/admin.proto"), proto_root.join("fila/v1/cluster.proto"), ]; @@ -16,7 +14,7 @@ fn main() -> Result<(), Box> { println!("cargo:rerun-if-changed={}", proto.display()); } - tonic_prost_build::configure().compile_protos( + prost_build::compile_protos( &proto_files .iter() .map(|p| p.to_string_lossy().into_owned()) diff --git a/crates/fila-proto/proto/fila/v1/admin.proto b/crates/fila-proto/proto/fila/v1/admin.proto deleted file mode 100644 index 886e58dc..00000000 --- a/crates/fila-proto/proto/fila/v1/admin.proto +++ /dev/null @@ -1,197 +0,0 @@ -syntax = "proto3"; -package fila.v1; - -// Admin RPCs for operators and the CLI. -service FilaAdmin { - rpc CreateQueue(CreateQueueRequest) returns (CreateQueueResponse); - rpc DeleteQueue(DeleteQueueRequest) returns (DeleteQueueResponse); - rpc SetConfig(SetConfigRequest) returns (SetConfigResponse); - rpc GetConfig(GetConfigRequest) returns (GetConfigResponse); - rpc ListConfig(ListConfigRequest) returns (ListConfigResponse); - rpc GetStats(GetStatsRequest) returns (GetStatsResponse); - rpc Redrive(RedriveRequest) returns (RedriveResponse); - rpc ListQueues(ListQueuesRequest) returns (ListQueuesResponse); - - // API key management. CreateApiKey bypasses auth (bootstrap); others require a valid key. - rpc CreateApiKey(CreateApiKeyRequest) returns (CreateApiKeyResponse); - rpc RevokeApiKey(RevokeApiKeyRequest) returns (RevokeApiKeyResponse); - rpc ListApiKeys(ListApiKeysRequest) returns (ListApiKeysResponse); - - // Per-key ACL management. - rpc SetAcl(SetAclRequest) returns (SetAclResponse); - rpc GetAcl(GetAclRequest) returns (GetAclResponse); -} - -message CreateQueueRequest { - string name = 1; - QueueConfig config = 2; -} - -message QueueConfig { - string on_enqueue_script = 1; - string on_failure_script = 2; - uint64 visibility_timeout_ms = 3; -} - -message CreateQueueResponse { - string queue_id = 1; -} - -message DeleteQueueRequest { - string queue = 1; -} - -message DeleteQueueResponse {} - -message SetConfigRequest { - string key = 1; - string value = 2; -} - -message SetConfigResponse {} - -message GetConfigRequest { - string key = 1; -} - -message GetConfigResponse { - string value = 1; -} - -message ConfigEntry { - string key = 1; - string value = 2; -} - -message ListConfigRequest { - string prefix = 1; -} - -message ListConfigResponse { - repeated ConfigEntry entries = 1; - uint32 total_count = 2; -} - -message GetStatsRequest { - string queue = 1; -} - -message PerFairnessKeyStats { - string key = 1; - uint64 pending_count = 2; - int64 current_deficit = 3; - uint32 weight = 4; -} - -message PerThrottleKeyStats { - string key = 1; - double tokens = 2; - double rate_per_second = 3; - double burst = 4; -} - -message GetStatsResponse { - uint64 depth = 1; - uint64 in_flight = 2; - uint64 active_fairness_keys = 3; - uint32 active_consumers = 4; - uint32 quantum = 5; - repeated PerFairnessKeyStats per_key_stats = 6; - repeated PerThrottleKeyStats per_throttle_stats = 7; - // Cluster fields (0 when not in cluster mode). - uint64 leader_node_id = 8; - uint32 replication_count = 9; -} - -message RedriveRequest { - string dlq_queue = 1; - uint64 count = 2; -} - -message RedriveResponse { - uint64 redriven = 1; -} - -message ListQueuesRequest {} - -message QueueInfo { - string name = 1; - uint64 depth = 2; - uint64 in_flight = 3; - uint32 active_consumers = 4; - uint64 leader_node_id = 5; -} - -message ListQueuesResponse { - repeated QueueInfo queues = 1; - uint32 cluster_node_count = 2; -} - -// --- API Key Management --- - -message CreateApiKeyRequest { - /// Human-readable label for the key. - string name = 1; - /// Optional Unix timestamp (milliseconds) after which the key expires. - /// 0 means no expiration. - uint64 expires_at_ms = 2; - /// When true, the key bypasses all ACL checks (superadmin). - bool is_superadmin = 3; -} - -message CreateApiKeyResponse { - /// Opaque key ID for management operations (revoke, list, set-acl). - string key_id = 1; - /// Plaintext API key. Returned once — store it securely. - string key = 2; - /// Whether this key has superadmin privileges. - bool is_superadmin = 3; -} - -message RevokeApiKeyRequest { - string key_id = 1; -} - -message RevokeApiKeyResponse {} - -message ListApiKeysRequest {} - -message ApiKeyInfo { - string key_id = 1; - string name = 2; - uint64 created_at_ms = 3; - /// 0 means no expiration. - uint64 expires_at_ms = 4; - bool is_superadmin = 5; -} - -message ListApiKeysResponse { - repeated ApiKeyInfo keys = 1; -} - -// --- ACL Management --- - -/// A single permission grant: kind (produce/consume/admin) + queue pattern. -message AclPermission { - /// One of: "produce", "consume", "admin". - string kind = 1; - /// Queue name or wildcard ("*" or "orders.*"). - string pattern = 2; -} - -message SetAclRequest { - string key_id = 1; - repeated AclPermission permissions = 2; -} - -message SetAclResponse {} - -message GetAclRequest { - string key_id = 1; -} - -message GetAclResponse { - string key_id = 1; - repeated AclPermission permissions = 2; - bool is_superadmin = 3; -} diff --git a/crates/fila-proto/proto/fila/v1/cluster.proto b/crates/fila-proto/proto/fila/v1/cluster.proto index 756d834c..874ba51c 100644 --- a/crates/fila-proto/proto/fila/v1/cluster.proto +++ b/crates/fila-proto/proto/fila/v1/cluster.proto @@ -4,34 +4,6 @@ package fila.v1; import "fila/v1/messages.proto"; -/// Internal cluster gRPC service for Raft consensus communication. -/// This service runs on a separate port from the client-facing service. -service FilaCluster { - /// Forward a Raft AppendEntries RPC. - rpc AppendEntries(RaftAppendEntriesRequest) returns (RaftAppendEntriesResponse); - - /// Forward a Raft Vote RPC. - rpc Vote(RaftVoteRequest) returns (RaftVoteResponse); - - /// Forward a Raft InstallSnapshot RPC. - rpc InstallSnapshot(RaftInstallSnapshotRequest) returns (RaftInstallSnapshotResponse); - - /// Request to add this node to the cluster. - rpc AddNode(AddNodeRequest) returns (AddNodeResponse); - - /// Request to remove a node from the cluster. - rpc RemoveNode(RemoveNodeRequest) returns (RemoveNodeResponse); - - /// Forward a client write to the Raft leader for a specific group. - /// Used for transparent request routing: when a node receives a write - /// request but is not the leader for the target group, it serializes - /// the ClusterRequest and forwards it to the leader via this RPC. - rpc ClientWrite(RaftClientWriteRequest) returns (RaftClientWriteResponse); - - /// Get this node's client-facing gRPC address (for leader hint routing). - rpc GetNodeInfo(GetNodeInfoRequest) returns (GetNodeInfoResponse); -} - // --- Primitive openraft types --- message RaftLeaderId { @@ -228,7 +200,7 @@ message RaftClientWriteResponse { message AddNodeRequest { uint64 node_id = 1; string addr = 2; - /// The joining node's client-facing gRPC address (e.g., "host:5555"). + /// The joining node's client-facing address (e.g., "host:5555"). /// Used for leader hint routing so clients can be directed to the correct /// node for consume operations. string client_addr = 3; @@ -255,6 +227,6 @@ message GetNodeInfoRequest {} message GetNodeInfoResponse { uint64 node_id = 1; - /// This node's client-facing gRPC address (e.g., "host:5555"). + /// This node's client-facing address (e.g., "host:5555"). string client_addr = 2; } diff --git a/crates/fila-proto/proto/fila/v1/service.proto b/crates/fila-proto/proto/fila/v1/service.proto deleted file mode 100644 index f14fdd0f..00000000 --- a/crates/fila-proto/proto/fila/v1/service.proto +++ /dev/null @@ -1,45 +0,0 @@ -syntax = "proto3"; -package fila.v1; - -import "fila/v1/messages.proto"; - -// Hot-path RPCs for producers and consumers. -service FilaService { - rpc Enqueue(EnqueueRequest) returns (EnqueueResponse); - rpc Consume(ConsumeRequest) returns (stream ConsumeResponse); - rpc Ack(AckRequest) returns (AckResponse); - rpc Nack(NackRequest) returns (NackResponse); -} - -message EnqueueRequest { - string queue = 1; - map headers = 2; - bytes payload = 3; -} - -message EnqueueResponse { - string message_id = 1; -} - -message ConsumeRequest { - string queue = 1; -} - -message ConsumeResponse { - Message message = 1; -} - -message AckRequest { - string queue = 1; - string message_id = 2; -} - -message AckResponse {} - -message NackRequest { - string queue = 1; - string message_id = 2; - string error = 3; -} - -message NackResponse {} diff --git a/crates/fila-proto/src/lib.rs b/crates/fila-proto/src/lib.rs index f7011415..cfb0a858 100644 --- a/crates/fila-proto/src/lib.rs +++ b/crates/fila-proto/src/lib.rs @@ -1 +1 @@ -tonic::include_proto!("fila.v1"); +include!(concat!(env!("OUT_DIR"), "/fila.v1.rs")); diff --git a/crates/fila-sdk/Cargo.toml b/crates/fila-sdk/Cargo.toml index 36c7217f..3292d6c1 100644 --- a/crates/fila-sdk/Cargo.toml +++ b/crates/fila-sdk/Cargo.toml @@ -10,6 +10,9 @@ homepage.workspace = true keywords = ["message-broker", "queue", "binary-protocol", "client"] categories = ["network-programming", "api-bindings"] +[features] +test-internals = [] + [dependencies] fila-fibp.workspace = true tokio = { workspace = true, features = ["net", "io-util", "sync", "rt", "macros"] } @@ -25,3 +28,4 @@ thiserror.workspace = true [dev-dependencies] tokio = { workspace = true, features = ["full"] } tempfile = "3" +fila-sdk = { path = ".", features = ["test-internals"] } diff --git a/crates/fila-sdk/src/client.rs b/crates/fila-sdk/src/client.rs index f4457183..587fbcb5 100644 --- a/crates/fila-sdk/src/client.rs +++ b/crates/fila-sdk/src/client.rs @@ -97,6 +97,11 @@ pub struct ConnectOptions { /// API key for authenticating with the broker. /// When set, the key is sent in the Handshake frame during connection. pub api_key: Option, + /// Capacity of the delivery channel for consume streams. + /// When the channel is full, deliveries overflow to an internal buffer. + /// Defaults to 256. + #[cfg(feature = "test-internals")] + pub delivery_buffer_size: Option, } impl ConnectOptions { @@ -144,6 +149,13 @@ impl ConnectOptions { self.api_key = Some(api_key.into()); self } + + /// Set the delivery channel buffer size (for testing backpressure behavior). + #[cfg(feature = "test-internals")] + pub fn with_delivery_buffer_size(mut self, size: usize) -> Self { + self.delivery_buffer_size = Some(size); + self + } } /// Strip URL scheme prefixes so addresses like "http://host:port" work. @@ -236,6 +248,10 @@ struct FilaClientInner { state: Arc>, next_request_id: AtomicU32, cancel: CancellationToken, + delivery_buffer_size: usize, + /// Number of messages currently in the delivery overflow buffer. + /// Updated by the background reader (increment) and overflow flush (decrement). + delivery_overflow_len: Arc, } impl Drop for FilaClientInner { @@ -483,11 +499,23 @@ impl FilaClient { stream: write_stream, })); + #[cfg(feature = "test-internals")] + let delivery_buffer_size = connect_options + .as_ref() + .and_then(|o| o.delivery_buffer_size) + .unwrap_or(256); + #[cfg(not(feature = "test-internals"))] + let delivery_buffer_size = 256; + + let delivery_overflow_len = Arc::new(std::sync::atomic::AtomicUsize::new(0)); + let inner = Arc::new(FilaClientInner { writer: Arc::clone(&writer), state: Arc::clone(&state), next_request_id: AtomicU32::new(1), cancel: cancel.clone(), + delivery_buffer_size, + delivery_overflow_len: Arc::clone(&delivery_overflow_len), }); // The background reader holds its own Arcs to state and writer, plus @@ -495,7 +523,7 @@ impl FilaClient { // FilaClient clones drop, FilaClientInner::drop fires and cancels the // token even though the reader task is still alive. tokio::spawn(async move { - background_reader(read, state, cancel, writer).await; + background_reader(read, state, cancel, writer, delivery_overflow_len).await; }); Ok(Self { @@ -639,7 +667,7 @@ impl FilaClient { // Register the delivery channel BEFORE sending the Consume request so // early Delivery frames aren't dropped. Save the previous channel to // restore on failure (avoids breaking an existing active consumer). - let (delivery_tx, delivery_rx) = mpsc::channel(256); + let (delivery_tx, delivery_rx) = mpsc::channel(self.inner.delivery_buffer_size); let prev_delivery_tx; let prev_consume_request_id; { @@ -736,15 +764,11 @@ impl FilaClient { let queue = queue.to_string(); Box::pin(async move { let leader_opts = match self.connect_options { - Some(ref opts) => ConnectOptions { - addr: leader_addr.clone(), - timeout: opts.timeout, - tls: opts.tls, - tls_ca_cert_pem: opts.tls_ca_cert_pem.clone(), - tls_client_cert_pem: opts.tls_client_cert_pem.clone(), - tls_client_key_pem: opts.tls_client_key_pem.clone(), - api_key: opts.api_key.clone(), - }, + Some(ref opts) => { + let mut leader_opts = opts.clone(); + leader_opts.addr = leader_addr.clone(); + leader_opts + } None => ConnectOptions::new(&leader_addr), }; match Self::connect_with_options(leader_opts).await { @@ -847,6 +871,15 @@ impl FilaClient { Ok(()) } + + /// Returns the current number of messages in the delivery overflow buffer. + /// Only available with the `test-internals` feature. + #[cfg(feature = "test-internals")] + pub async fn delivery_internal_len(&self) -> usize { + self.inner + .delivery_overflow_len + .load(std::sync::atomic::Ordering::Relaxed) + } } /// Read a single frame from the stream. @@ -888,6 +921,7 @@ async fn background_reader( state: Arc>, cancel: CancellationToken, writer: Arc>, + overflow_len_counter: Arc, ) { let mut buf = BytesMut::with_capacity(4096); let mut delivery_overflow: std::collections::VecDeque> = @@ -903,21 +937,26 @@ async fn background_reader( if let Some(tx) = tx { while let Some(item) = delivery_overflow.pop_front() { match tx.try_send(item) { - Ok(_) => {} + Ok(_) => { + overflow_len_counter.fetch_sub(1, std::sync::atomic::Ordering::Relaxed); + } Err(mpsc::error::TrySendError::Full(item)) => { - // Put it back at the front and stop flushing. delivery_overflow.push_front(item); break; } Err(mpsc::error::TrySendError::Closed(_)) => { + let drained = delivery_overflow.len(); delivery_overflow.clear(); + overflow_len_counter + .fetch_sub(drained, std::sync::atomic::Ordering::Relaxed); break; } } } } else { - // No delivery channel — discard overflow. + let drained = delivery_overflow.len(); delivery_overflow.clear(); + overflow_len_counter.fetch_sub(drained, std::sync::atomic::Ordering::Relaxed); } } @@ -925,7 +964,13 @@ async fn background_reader( loop { match RawFrame::decode(&mut buf) { Ok(Some(frame)) => { - dispatch_frame_inline(frame, &state, &mut delivery_overflow).await; + dispatch_frame_inline( + frame, + &state, + &mut delivery_overflow, + &overflow_len_counter, + ) + .await; } Ok(None) => break, Err(e) => { @@ -975,6 +1020,7 @@ async fn dispatch_frame_inline( frame: RawFrame, state: &Mutex, delivery_overflow: &mut std::collections::VecDeque>, + overflow_len_counter: &std::sync::atomic::AtomicUsize, ) { let opcode = Opcode::from_u8(frame.opcode); let request_id = frame.request_id; @@ -1004,6 +1050,8 @@ async fn dispatch_frame_inline( Ok(_) => {} Err(mpsc::error::TrySendError::Full(item)) => { delivery_overflow.push_back(item); + overflow_len_counter + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); } Err(mpsc::error::TrySendError::Closed(_)) => {} } @@ -1017,6 +1065,8 @@ async fn dispatch_frame_inline( Ok(_) => {} Err(mpsc::error::TrySendError::Full(item)) => { delivery_overflow.push_back(item); + overflow_len_counter + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); } Err(mpsc::error::TrySendError::Closed(_)) => {} } diff --git a/crates/fila-sdk/src/error.rs b/crates/fila-sdk/src/error.rs index 888b96d8..0e9ff026 100644 --- a/crates/fila-sdk/src/error.rs +++ b/crates/fila-sdk/src/error.rs @@ -113,17 +113,17 @@ pub(crate) fn error_code_to_status(code: ErrorCode, message: String) -> StatusEr } } -pub(crate) fn enqueue_error_from_code(code: ErrorCode, message: String) -> EnqueueError { +pub(crate) fn consume_error_from_code(code: ErrorCode, message: String) -> ConsumeError { match code { - ErrorCode::QueueNotFound => EnqueueError::QueueNotFound(message), - _ => EnqueueError::Status(error_code_to_status(code, message)), + ErrorCode::QueueNotFound => ConsumeError::QueueNotFound(message), + _ => ConsumeError::Status(error_code_to_status(code, message)), } } -pub(crate) fn consume_error_from_code(code: ErrorCode, message: String) -> ConsumeError { +pub(crate) fn enqueue_error_from_code(code: ErrorCode, message: String) -> EnqueueError { match code { - ErrorCode::QueueNotFound => ConsumeError::QueueNotFound(message), - _ => ConsumeError::Status(error_code_to_status(code, message)), + ErrorCode::QueueNotFound => EnqueueError::QueueNotFound(message), + _ => EnqueueError::Status(error_code_to_status(code, message)), } } diff --git a/crates/fila-sdk/tests/backpressure.rs b/crates/fila-sdk/tests/backpressure.rs new file mode 100644 index 00000000..6b1ddcd4 --- /dev/null +++ b/crates/fila-sdk/tests/backpressure.rs @@ -0,0 +1,115 @@ +//! Validates SDK delivery buffering behavior under backpressure. +//! +//! When a consumer stops reading, the SDK buffers delivery frames in an +//! internal overflow VecDeque. The overflow is unbounded on the client side +//! (to prevent deadlocking response frames), but the server's delivery +//! channel (64 capacity) provides natural backpressure that limits how +//! fast messages are pushed. + +use std::collections::HashMap; +use std::io::{BufRead, BufReader}; +use std::net::TcpListener; +use std::path::PathBuf; +use std::process::{Command, Stdio}; +use std::time::Duration; + +use fila_sdk::FilaClient; + +fn free_port() -> u16 { + TcpListener::bind("127.0.0.1:0") + .unwrap() + .local_addr() + .unwrap() + .port() +} + +fn workspace_binary(name: &str) -> PathBuf { + let mut p = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + p.pop(); + p.pop(); + p.push("target/debug"); + p.push(name); + p +} + +/// Prove that the delivery overflow buffer tracks messages correctly and +/// that the SDK remains functional (no deadlock) even when the consumer +/// stops reading. +#[tokio::test(flavor = "multi_thread")] +async fn stalled_consumer_buffers_without_deadlock() { + let port = free_port(); + let addr = format!("127.0.0.1:{port}"); + let data_dir = tempfile::tempdir().unwrap(); + + let config = + format!("[server]\nlisten_addr = \"{addr}\"\n\n[telemetry]\notlp_endpoint = \"\"\n"); + std::fs::write(data_dir.path().join("fila.toml"), config).unwrap(); + + let mut child = Command::new(workspace_binary("fila-server")) + .env( + "FILA_DATA_DIR", + data_dir.path().join("data").to_str().unwrap(), + ) + .current_dir(data_dir.path()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .unwrap(); + let stdout = child.stdout.take().unwrap(); + std::thread::spawn(move || for _ in BufReader::new(stdout).lines() {}); + let stderr = child.stderr.take().unwrap(); + std::thread::spawn(move || for _ in BufReader::new(stderr).lines() {}); + + for _ in 0..50 { + if std::net::TcpStream::connect(&addr).is_ok() { + break; + } + std::thread::sleep(Duration::from_millis(50)); + } + + // Create queue + let out = Command::new(workspace_binary("fila")) + .args(["--addr", &addr, "queue", "create", "backpressure-q"]) + .output() + .unwrap(); + assert!(out.status.success()); + + // Enqueue 1000 messages on a separate connection + let producer = FilaClient::connect(&addr).await.unwrap(); + for _ in 0..1000 { + let _ = producer + .enqueue("backpressure-q", HashMap::new(), b"x".to_vec()) + .await; + } + + // Connect consumer with a tiny delivery channel. + let consumer = FilaClient::connect_with_options( + fila_sdk::ConnectOptions::new(&addr).with_delivery_buffer_size(2), + ) + .await + .unwrap(); + + let _stream = consumer.consume("backpressure-q").await.unwrap(); + + // Don't read from the stream — let the server deliver and the overflow grow. + tokio::time::sleep(Duration::from_secs(2)).await; + + // The overflow should have messages (server pushed them). + let overflow_len = consumer.delivery_internal_len().await; + eprintln!("overflow buffer size with stalled consumer: {overflow_len}"); + + // Key assertion: the SDK didn't deadlock. We can still do operations + // on the SAME connection that has the stalled consumer. + let enqueue_result = tokio::time::timeout( + Duration::from_secs(3), + consumer.enqueue("backpressure-q", HashMap::new(), b"after-stall".to_vec()), + ) + .await; + assert!( + enqueue_result.is_ok(), + "enqueue on stalled consumer connection should succeed (no deadlock)" + ); + + drop(_stream); + child.kill().unwrap(); +} diff --git a/crates/fila-sdk/tests/integration.rs b/crates/fila-sdk/tests/integration.rs index f6ce758b..7d58fa48 100644 --- a/crates/fila-sdk/tests/integration.rs +++ b/crates/fila-sdk/tests/integration.rs @@ -27,29 +27,21 @@ fn workspace_binary(name: &str) -> PathBuf { struct TestServer { child: Option, - /// gRPC address (http://host:port) for CLI commands. - grpc_addr: String, - /// Binary protocol address (host:port) for SDK connections. - binary_addr: String, + /// Address (host:port) for both CLI and SDK connections. + addr: String, _data_dir: tempfile::TempDir, } impl TestServer { fn start() -> Self { - let grpc_port = free_port(); - let mut binary_port = free_port(); - while binary_port == grpc_port { - binary_port = free_port(); - } - let grpc_addr_raw = format!("127.0.0.1:{grpc_port}"); - let binary_addr = format!("127.0.0.1:{binary_port}"); + let port = free_port(); + let addr = format!("127.0.0.1:{port}"); let data_dir = tempfile::tempdir().expect("create temp dir"); let config_path = data_dir.path().join("fila.toml"); let config_content = format!( r#"[server] -listen_addr = "{grpc_addr_raw}" -binary_addr = "{binary_addr}" +listen_addr = "{addr}" [telemetry] otlp_endpoint = "" @@ -80,43 +72,29 @@ otlp_endpoint = "" let stderr = child.stderr.take().expect("stderr"); std::thread::spawn(move || for _ in BufReader::new(stderr).lines() {}); - // Poll until both ports are reachable. + // Poll until the port is reachable. let start = std::time::Instant::now(); - let mut grpc_ok = false; - let mut binary_ok = false; while start.elapsed() < Duration::from_secs(10) { - if !grpc_ok && std::net::TcpStream::connect(&grpc_addr_raw).is_ok() { - grpc_ok = true; - } - if !binary_ok && std::net::TcpStream::connect(&binary_addr).is_ok() { - binary_ok = true; - } - if grpc_ok && binary_ok { + if std::net::TcpStream::connect(&addr).is_ok() { break; } std::thread::sleep(Duration::from_millis(50)); } assert!( - grpc_ok && binary_ok, + std::net::TcpStream::connect(&addr).is_ok(), "fila-server did not become reachable within 10s" ); Self { child: Some(child), - grpc_addr: format!("http://{grpc_addr_raw}"), - binary_addr, + addr, _data_dir: data_dir, } } - /// gRPC address for CLI commands. - fn grpc_addr(&self) -> &str { - &self.grpc_addr - } - - /// Binary protocol address for SDK connections. - fn binary_addr(&self) -> &str { - &self.binary_addr + /// Address for CLI and SDK connections. + fn addr(&self) -> &str { + &self.addr } } @@ -152,9 +130,9 @@ fn create_queue_cli(addr: &str, name: &str) { #[tokio::test] async fn enqueue_consume_ack_lifecycle() { let server = TestServer::start(); - create_queue_cli(server.binary_addr(), "test-lifecycle"); + create_queue_cli(server.addr(), "test-lifecycle"); - let client = FilaClient::connect(server.binary_addr()).await.unwrap(); + let client = FilaClient::connect(server.addr()).await.unwrap(); let queue = "test-lifecycle"; // Enqueue a message @@ -193,9 +171,9 @@ async fn enqueue_consume_ack_lifecycle() { #[tokio::test] async fn enqueue_consume_nack_release() { let server = TestServer::start(); - create_queue_cli(server.binary_addr(), "test-nack"); + create_queue_cli(server.addr(), "test-nack"); - let client = FilaClient::connect(server.binary_addr()).await.unwrap(); + let client = FilaClient::connect(server.addr()).await.unwrap(); let queue = "test-nack"; // Enqueue @@ -235,7 +213,7 @@ async fn enqueue_consume_nack_release() { #[tokio::test] async fn enqueue_to_nonexistent_queue() { let server = TestServer::start(); - let client = FilaClient::connect(server.binary_addr()).await.unwrap(); + let client = FilaClient::connect(server.addr()).await.unwrap(); let err = client .enqueue("no-such-queue", HashMap::new(), b"data".to_vec()) @@ -257,7 +235,7 @@ async fn double_consume_does_not_hang() { let cli = workspace_binary("fila"); let output: Output = Command::new(&cli) .arg("--addr") - .arg(server.binary_addr()) + .arg(server.addr()) .args([ "queue", "create", @@ -274,7 +252,7 @@ async fn double_consume_does_not_hang() { ); } - let client = FilaClient::connect(server.binary_addr()).await.unwrap(); + let client = FilaClient::connect(server.addr()).await.unwrap(); // Enqueue messages for i in 0..3 { @@ -320,14 +298,14 @@ async fn double_consume_does_not_hang() { #[tokio::test] async fn client_drop_sends_disconnect_and_cleans_up() { let server = TestServer::start(); - create_queue_cli(server.binary_addr(), "disconnect-queue"); + create_queue_cli(server.addr(), "disconnect-queue"); // Connect and start consuming so the server registers an active consumer. - let client = FilaClient::connect(server.binary_addr()).await.unwrap(); + let client = FilaClient::connect(server.addr()).await.unwrap(); let _stream = client.consume("disconnect-queue").await.unwrap(); // Use CLI to verify the server sees an active consumer. - let consumers = parse_active_consumers(&cli_inspect(server.binary_addr(), "disconnect-queue")); + let consumers = parse_active_consumers(&cli_inspect(server.addr(), "disconnect-queue")); assert_eq!(consumers, 1, "should have 1 consumer before drop"); // Drop the consuming client — this should send a Disconnect frame @@ -338,8 +316,7 @@ async fn client_drop_sends_disconnect_and_cleans_up() { // Poll until the server processes the disconnect (with timeout). let start = std::time::Instant::now(); loop { - let consumers = - parse_active_consumers(&cli_inspect(server.binary_addr(), "disconnect-queue")); + let consumers = parse_active_consumers(&cli_inspect(server.addr(), "disconnect-queue")); if consumers == 0 { break; } diff --git a/crates/fila-server/Cargo.toml b/crates/fila-server/Cargo.toml index 93353917..bfebfd37 100644 --- a/crates/fila-server/Cargo.toml +++ b/crates/fila-server/Cargo.toml @@ -5,27 +5,17 @@ edition.workspace = true license.workspace = true repository.workspace = true rust-version.workspace = true -description = "gRPC server binary for Fila" +description = "Binary protocol server for Fila" publish = false [dependencies] -fila-proto.workspace = true fila-core.workspace = true fila-fibp.workspace = true -tonic.workspace = true -tonic-prost.workspace = true tokio.workspace = true -tokio-stream = "0.1" tracing.workspace = true -tracing-opentelemetry.workspace = true -opentelemetry.workspace = true toml.workspace = true uuid.workspace = true bytes.workspace = true -tower.workspace = true -http.workspace = true -pin-project-lite.workspace = true -futures-core = "0.3" tokio-rustls = "0.26" rustls = "0.23" rustls-pemfile = "2" diff --git a/crates/fila-server/src/admin_service.rs b/crates/fila-server/src/admin_service.rs deleted file mode 100644 index 2ea0110f..00000000 --- a/crates/fila-server/src/admin_service.rs +++ /dev/null @@ -1,1207 +0,0 @@ -use std::sync::Arc; - -use fila_core::{ - Broker, ClusterHandle, ClusterRequest, ClusterResponse, ClusterWriteError, QueueConfig, - SchedulerCommand, -}; -use fila_proto::fila_admin_server::FilaAdmin; -use fila_proto::{ - AclPermission, ApiKeyInfo, ConfigEntry, CreateApiKeyRequest, CreateApiKeyResponse, - CreateQueueRequest, CreateQueueResponse, DeleteQueueRequest, DeleteQueueResponse, - GetAclRequest, GetAclResponse, GetConfigRequest, GetConfigResponse, GetStatsRequest, - GetStatsResponse, ListApiKeysRequest, ListApiKeysResponse, ListConfigRequest, - ListConfigResponse, ListQueuesRequest, ListQueuesResponse, PerFairnessKeyStats, - PerThrottleKeyStats, QueueInfo, RedriveRequest, RedriveResponse, RevokeApiKeyRequest, - RevokeApiKeyResponse, SetAclRequest, SetAclResponse, SetConfigRequest, SetConfigResponse, -}; -use tonic::{Request, Response, Status}; -use tracing::instrument; - -use crate::auth::ValidatedKeyId; -use crate::error::IntoStatus; - -/// Map cluster write errors to appropriate gRPC status codes. -fn cluster_write_err_to_status(err: ClusterWriteError) -> Status { - match err { - ClusterWriteError::QueueGroupNotFound => Status::not_found("queue raft group not found"), - ClusterWriteError::NodeNotReady => { - Status::unavailable("node not ready for this queue — retry on another node or wait") - } - ClusterWriteError::NoLeader => Status::unavailable("no leader available"), - ClusterWriteError::Raft(e) => Status::internal(format!("raft error: {e}")), - ClusterWriteError::Forward(e) => Status::unavailable(format!("forward error: {e}")), - } -} - -/// Count current queue leaderships per node across all queue Raft groups. -async fn count_leaderships( - cluster: &ClusterHandle, - candidates: &[u64], -) -> std::collections::HashMap { - let groups = cluster.multi_raft.snapshot_groups().await; - - let mut counts: std::collections::HashMap = - candidates.iter().map(|&id| (id, 0)).collect(); - - for (_queue_id, raft) in &groups { - if let Some(leader) = raft.current_leader().await { - if let Some(count) = counts.get_mut(&leader) { - *count += 1; - } - } - } - - counts -} - -/// Select which nodes should participate in a new queue's Raft group, -/// and which node should be the preferred leader. -/// -/// When the cluster has more nodes than `replication_factor`, only the -/// N least-loaded nodes are selected. The preferred leader is the node -/// with the fewest leaderships among the selected members. -async fn select_members_and_leader( - all_members: &[u64], - cluster: &ClusterHandle, -) -> (Vec, u64) { - let rf = cluster.replication_factor; - let counts = count_leaderships(cluster, all_members).await; - - // Select subset if cluster is larger than replication factor. - let selected: Vec = if all_members.len() > rf && rf > 0 { - let mut sorted: Vec = all_members.to_vec(); - sorted.sort_by_key(|&id| { - let count = counts.get(&id).copied().unwrap_or(0); - (count, id) - }); - sorted.truncate(rf); - sorted - } else { - all_members.to_vec() - }; - - // Preferred leader: least-loaded among selected (tie-break: lowest ID). - let preferred = selected - .iter() - .copied() - .min_by_key(|&id| { - let count = counts.get(&id).copied().unwrap_or(0); - (count, id) - }) - .unwrap_or_else(|| selected.first().copied().unwrap_or(1)); - - (selected, preferred) -} - -/// gRPC admin service implementation. Wraps a Broker to send commands -/// to the scheduler thread. -pub struct AdminService { - broker: Arc, - cluster: Option>, -} - -impl AdminService { - const MAX_CONFIG_KEY_LEN: usize = 256; - const MAX_CONFIG_VALUE_LEN: usize = 1024; - - pub fn new(broker: Arc, cluster: Option>) -> Self { - Self { broker, cluster } - } - - /// Check that the request's validated key has `admin:*` permission (or is superadmin). - /// - /// Used for global operations (config, stats, revoke, list) that are not - /// scoped to a specific queue or permission set. - /// - /// Returns `Ok(())` when auth is disabled or the key has global admin access. - fn check_admin(&self, request: &tonic::Request) -> Result<(), Status> { - let caller = match request.extensions().get::() { - Some(k) => &k.0, - None => return Ok(()), // auth disabled - }; - let permitted = self - .broker - .check_permission(caller, fila_core::broker::auth::Permission::Admin, "*") - .map_err(|e| Status::internal(format!("acl check error: {e}")))?; - if permitted { - Ok(()) - } else { - Err(Status::permission_denied( - "key does not have admin permission", - )) - } - } - - /// Check that the request's validated key has *any* admin permission. - /// - /// Used for scoped operations (create_queue, delete_queue, create_api_key, - /// set_acl) where a per-queue admin key (`admin:orders`) is sufficient. - /// The caller is returned so the handler can perform additional scope checks. - /// - /// Returns `Ok(None)` when auth is disabled (all operations permitted). - fn require_any_admin( - &self, - request: &tonic::Request, - ) -> Result, Status> { - let caller = match request.extensions().get::() { - Some(k) => k.0.clone(), - None => return Ok(None), // auth disabled - }; - let ok = self - .broker - .has_any_admin(&caller) - .map_err(|e| Status::internal(format!("acl check error: {e}")))?; - if ok { - Ok(Some(caller)) - } else { - Err(Status::permission_denied( - "key does not have admin permission", - )) - } - } -} - -#[tonic::async_trait] -impl FilaAdmin for AdminService { - #[instrument(skip(self), fields(queue_id))] - async fn create_queue( - &self, - request: Request, - ) -> Result, Status> { - let caller = self.require_any_admin(&request)?; - let req = request.into_inner(); - - if req.name.is_empty() { - return Err(Status::invalid_argument("queue name must not be empty")); - } - - // Scope check: caller must have admin permission covering this queue name. - if let Some(ref caller) = caller { - let ok = self - .broker - .caller_has_queue_admin(caller, &req.name) - .map_err(|e| Status::internal(format!("acl check error: {e}")))?; - if !ok { - return Err(Status::permission_denied(format!( - "key does not have admin permission on queue \"{}\"", - req.name - ))); - } - } - tracing::Span::current().record("queue_id", req.name.as_str()); - - let proto_config = req.config.unwrap_or_default(); - - let visibility_timeout_ms = match proto_config.visibility_timeout_ms { - 0 => QueueConfig::DEFAULT_VISIBILITY_TIMEOUT_MS, // proto3: 0 means unset - v if v < 1_000 => { - return Err(Status::invalid_argument( - "visibility_timeout_ms must be at least 1000 (1 second)", - )); - } - v => v, - }; - - let config = QueueConfig { - name: req.name.clone(), - on_enqueue_script: if proto_config.on_enqueue_script.is_empty() { - None - } else { - Some(proto_config.on_enqueue_script) - }, - on_failure_script: if proto_config.on_failure_script.is_empty() { - None - } else { - Some(proto_config.on_failure_script) - }, - visibility_timeout_ms, - dlq_queue_id: None, - lua_timeout_ms: None, - lua_memory_limit_bytes: None, - }; - - if let Some(ref cluster) = self.cluster { - // Cluster mode: submit CreateQueueGroup to meta Raft. - // The meta state machine event handler will create the queue - // in the local scheduler and start the queue's Raft group on - // all nodes. - let (all_members, _member_addrs) = cluster.meta_members(); - - // Select which nodes participate and the preferred leader. - // When the cluster is larger than replication_factor, only a - // subset of least-loaded nodes are selected. - let (members, preferred_leader) = - select_members_and_leader(&all_members, cluster.as_ref()).await; - - let resp = cluster - .write_to_meta(ClusterRequest::CreateQueueGroup { - queue_id: req.name.clone(), - members, - config, - preferred_leader, - }) - .await - .map_err(cluster_write_err_to_status)?; - - match resp { - ClusterResponse::CreateQueueGroup { queue_id } => { - Ok(Response::new(CreateQueueResponse { queue_id })) - } - ClusterResponse::Error { message } => Err(Status::internal(message)), - _ => Err(Status::internal("unexpected cluster response")), - } - } else { - // Single-node mode: direct to scheduler. - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::CreateQueue { - name: req.name, - config, - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - let queue_id = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))? - .map_err(IntoStatus::into_status)?; - - Ok(Response::new(CreateQueueResponse { queue_id })) - } - } - - #[instrument(skip(self), fields(queue_id))] - async fn delete_queue( - &self, - request: Request, - ) -> Result, Status> { - let caller = self.require_any_admin(&request)?; - let req = request.into_inner(); - - if req.queue.is_empty() { - return Err(Status::invalid_argument("queue name must not be empty")); - } - - // Scope check: caller must have admin permission covering this queue. - if let Some(ref caller) = caller { - let ok = self - .broker - .caller_has_queue_admin(caller, &req.queue) - .map_err(|e| Status::internal(format!("acl check error: {e}")))?; - if !ok { - return Err(Status::permission_denied(format!( - "key does not have admin permission on queue \"{}\"", - req.queue - ))); - } - } - tracing::Span::current().record("queue_id", req.queue.as_str()); - - if let Some(ref cluster) = self.cluster { - // Cluster mode: submit DeleteQueueGroup to meta Raft. - let resp = cluster - .write_to_meta(ClusterRequest::DeleteQueueGroup { - queue_id: req.queue, - }) - .await - .map_err(cluster_write_err_to_status)?; - - match resp { - ClusterResponse::DeleteQueueGroup => Ok(Response::new(DeleteQueueResponse {})), - ClusterResponse::Error { message } => Err(Status::internal(message)), - _ => Err(Status::internal("unexpected cluster response")), - } - } else { - // Single-node mode: direct to scheduler. - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::DeleteQueue { - queue_id: req.queue, - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))? - .map_err(IntoStatus::into_status)?; - - Ok(Response::new(DeleteQueueResponse {})) - } - } - - #[instrument(skip(self))] - async fn set_config( - &self, - request: Request, - ) -> Result, Status> { - self.check_admin(&request)?; - let req = request.into_inner(); - - if req.key.is_empty() { - return Err(Status::invalid_argument("config key must not be empty")); - } - if req.key.len() > AdminService::MAX_CONFIG_KEY_LEN { - return Err(Status::invalid_argument(format!( - "config key must not exceed {} bytes", - AdminService::MAX_CONFIG_KEY_LEN - ))); - } - if req.value.len() > AdminService::MAX_CONFIG_VALUE_LEN { - return Err(Status::invalid_argument(format!( - "config value must not exceed {} bytes", - AdminService::MAX_CONFIG_VALUE_LEN - ))); - } - - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::SetConfig { - key: req.key, - value: req.value, - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))? - .map_err(IntoStatus::into_status)?; - - Ok(Response::new(SetConfigResponse {})) - } - - #[instrument(skip(self))] - async fn get_config( - &self, - request: Request, - ) -> Result, Status> { - self.check_admin(&request)?; - let req = request.into_inner(); - - if req.key.is_empty() { - return Err(Status::invalid_argument("config key must not be empty")); - } - if req.key.len() > AdminService::MAX_CONFIG_KEY_LEN { - return Err(Status::invalid_argument(format!( - "config key must not exceed {} bytes", - AdminService::MAX_CONFIG_KEY_LEN - ))); - } - - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::GetConfig { - key: req.key, - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - let value = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))? - .map_err(IntoStatus::into_status)?; - - Ok(Response::new(GetConfigResponse { - value: value.unwrap_or_default(), - })) - } - - #[instrument(skip(self))] - async fn list_config( - &self, - request: Request, - ) -> Result, Status> { - self.check_admin(&request)?; - let req = request.into_inner(); - - if req.prefix.len() > AdminService::MAX_CONFIG_KEY_LEN { - return Err(Status::invalid_argument(format!( - "config prefix must not exceed {} bytes", - AdminService::MAX_CONFIG_KEY_LEN - ))); - } - - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::ListConfig { - prefix: req.prefix, - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - let entries = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))? - .map_err(IntoStatus::into_status)?; - - let total_count = u32::try_from(entries.len()).unwrap_or(u32::MAX); - let config_entries = entries - .into_iter() - .map(|(key, value)| ConfigEntry { key, value }) - .collect(); - - Ok(Response::new(ListConfigResponse { - entries: config_entries, - total_count, - })) - } - - #[instrument(skip(self))] - async fn get_stats( - &self, - request: Request, - ) -> Result, Status> { - self.check_admin(&request)?; - let req = request.into_inner(); - - if req.queue.is_empty() { - return Err(Status::invalid_argument("queue name must not be empty")); - } - - let queue_name = req.queue; - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::GetStats { - queue_id: queue_name.clone(), - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - let stats = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))? - .map_err(IntoStatus::into_status)?; - - let per_key_stats = stats - .per_key_stats - .into_iter() - .map(|s| PerFairnessKeyStats { - key: s.key, - pending_count: s.pending_count, - current_deficit: s.current_deficit, - weight: s.weight, - }) - .collect(); - - let per_throttle_stats = stats - .per_throttle_stats - .into_iter() - .map(|s| PerThrottleKeyStats { - key: s.key, - tokens: s.tokens, - rate_per_second: s.rate_per_second, - burst: s.burst, - }) - .collect(); - - // Enrich with cluster info if in cluster mode. - let (leader_node_id, replication_count) = if let Some(cluster) = &self.cluster { - let leader = cluster - .multi_raft - .get_raft(&queue_name) - .await - .map(|raft| { - let metrics = raft.metrics().borrow().clone(); - let leader_id = metrics.current_leader.unwrap_or(0); - let voters = metrics.membership_config.membership().voter_ids().count() as u32; - (leader_id, voters) - }) - .unwrap_or((0, 0)); - leader - } else { - (0, 0) - }; - - Ok(Response::new(GetStatsResponse { - depth: stats.depth, - in_flight: stats.in_flight, - active_fairness_keys: stats.active_fairness_keys, - active_consumers: stats.active_consumers, - quantum: stats.quantum, - per_key_stats, - per_throttle_stats, - leader_node_id, - replication_count, - })) - } - - #[instrument(skip(self))] - async fn redrive( - &self, - request: Request, - ) -> Result, Status> { - self.check_admin(&request)?; - let req = request.into_inner(); - - if req.dlq_queue.is_empty() { - return Err(Status::invalid_argument("dlq_queue name must not be empty")); - } - - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::Redrive { - dlq_queue_id: req.dlq_queue, - count: req.count, - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - let redriven = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))? - .map_err(IntoStatus::into_status)?; - - Ok(Response::new(RedriveResponse { redriven })) - } - - #[instrument(skip(self))] - async fn list_queues( - &self, - request: Request, - ) -> Result, Status> { - self.check_admin(&request)?; - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::ListQueues { reply: reply_tx }) - .map_err(IntoStatus::into_status)?; - - let summaries = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))? - .map_err(IntoStatus::into_status)?; - - let mut queues = Vec::with_capacity(summaries.len()); - for s in summaries { - let leader_node_id = if let Some(cluster) = &self.cluster { - cluster - .multi_raft - .get_raft(&s.name) - .await - .map(|raft| raft.metrics().borrow().current_leader.unwrap_or(0)) - .unwrap_or(0) - } else { - 0 - }; - queues.push(QueueInfo { - name: s.name, - depth: s.depth, - in_flight: s.in_flight, - active_consumers: s.active_consumers, - leader_node_id, - }); - } - - let cluster_node_count = self - .cluster - .as_ref() - .map(|c| { - let (members, _) = c.meta_members(); - members.len() as u32 - }) - .unwrap_or(0); - - Ok(Response::new(ListQueuesResponse { - queues, - cluster_node_count, - })) - } - - #[instrument(skip(self))] - async fn create_api_key( - &self, - request: Request, - ) -> Result, Status> { - let caller = self.require_any_admin(&request)?; - let req = request.into_inner(); - if req.name.is_empty() { - return Err(Status::invalid_argument("name must not be empty")); - } - // Only superadmins can create other superadmin keys — admins cannot - // elevate a new key beyond their own privilege level. - if req.is_superadmin { - if let Some(ref caller) = caller { - let ok = self - .broker - .is_superadmin(caller) - .map_err(|e| Status::internal(format!("acl check error: {e}")))?; - if !ok { - return Err(Status::permission_denied( - "only superadmin keys can create other superadmin keys", - )); - } - } - } - let expires_at = if req.expires_at_ms == 0 { - None - } else { - Some(req.expires_at_ms) - }; - let (key_id, token) = self - .broker - .create_api_key(&req.name, expires_at, req.is_superadmin) - .map_err(|e| Status::internal(format!("storage error: {e}")))?; - Ok(Response::new(CreateApiKeyResponse { - key_id, - key: token, - is_superadmin: req.is_superadmin, - })) - } - - #[instrument(skip(self))] - async fn revoke_api_key( - &self, - request: Request, - ) -> Result, Status> { - self.check_admin(&request)?; - let req = request.into_inner(); - if req.key_id.is_empty() { - return Err(Status::invalid_argument("key_id must not be empty")); - } - let found = self - .broker - .revoke_api_key(&req.key_id) - .map_err(|e| Status::internal(format!("storage error: {e}")))?; - if found { - Ok(Response::new(RevokeApiKeyResponse {})) - } else { - Err(Status::not_found("api key not found")) - } - } - - #[instrument(skip(self))] - async fn list_api_keys( - &self, - request: Request, - ) -> Result, Status> { - self.check_admin(&request)?; - let entries = self - .broker - .list_api_keys() - .map_err(|e| Status::internal(format!("storage error: {e}")))?; - let keys = entries - .into_iter() - .map(|e| ApiKeyInfo { - key_id: e.key_id, - name: e.name, - created_at_ms: e.created_at_ms, - expires_at_ms: e.expires_at_ms.unwrap_or(0), - is_superadmin: e.is_superadmin, - }) - .collect(); - Ok(Response::new(ListApiKeysResponse { keys })) - } - - #[instrument(skip(self))] - async fn set_acl( - &self, - request: Request, - ) -> Result, Status> { - let caller = self.require_any_admin(&request)?; - let req = request.into_inner(); - if req.key_id.is_empty() { - return Err(Status::invalid_argument("key_id must not be empty")); - } - const VALID_KINDS: &[&str] = &["produce", "consume", "admin"]; - for p in &req.permissions { - if !VALID_KINDS.contains(&p.kind.as_str()) { - return Err(Status::invalid_argument(format!( - "invalid permission kind \"{}\"; must be one of: produce, consume, admin", - p.kind - ))); - } - } - // Delegation scope check: caller can only grant permissions within their own admin scope. - if let Some(ref caller) = caller { - let perms: Vec<(String, String)> = req - .permissions - .iter() - .map(|p| (p.kind.clone(), p.pattern.clone())) - .collect(); - let ok = self - .broker - .caller_can_grant_all(caller, &perms) - .map_err(|e| Status::internal(format!("acl check error: {e}")))?; - if !ok { - return Err(Status::permission_denied( - "one or more permissions exceed the caller's admin scope", - )); - } - } - let permissions = req - .permissions - .into_iter() - .map(|p| (p.kind, p.pattern)) - .collect(); - let found = self - .broker - .set_acl(&req.key_id, permissions) - .map_err(|e| Status::internal(format!("storage error: {e}")))?; - if found { - Ok(Response::new(SetAclResponse {})) - } else { - Err(Status::not_found("api key not found")) - } - } - - #[instrument(skip(self))] - async fn get_acl( - &self, - request: Request, - ) -> Result, Status> { - self.check_admin(&request)?; - let req = request.into_inner(); - if req.key_id.is_empty() { - return Err(Status::invalid_argument("key_id must not be empty")); - } - match self - .broker - .get_acl(&req.key_id) - .map_err(|e| Status::internal(format!("storage error: {e}")))? - { - Some(entry) => Ok(Response::new(GetAclResponse { - key_id: entry.key_id, - permissions: entry - .permissions - .into_iter() - .map(|(kind, pattern)| AclPermission { kind, pattern }) - .collect(), - is_superadmin: entry.is_superadmin, - })), - None => Err(Status::not_found("api key not found")), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use fila_core::{Broker, BrokerConfig, RocksDbEngine}; - use fila_proto::QueueConfig as ProtoQueueConfig; - - fn test_admin_service() -> (AdminService, tempfile::TempDir) { - let dir = tempfile::tempdir().unwrap(); - let storage = Arc::new(RocksDbEngine::open(dir.path()).unwrap()); - let broker = Arc::new(Broker::new(BrokerConfig::default(), storage).unwrap()); - (AdminService::new(broker, None), dir) - } - - #[tokio::test] - async fn create_queue_rejects_invalid_visibility_timeout() { - let (svc, _dir) = test_admin_service(); - - let request = Request::new(CreateQueueRequest { - name: "test-queue".to_string(), - config: Some(ProtoQueueConfig { - visibility_timeout_ms: 500, // too low - ..Default::default() - }), - }); - - let err = svc.create_queue(request).await.unwrap_err(); - assert_eq!(err.code(), tonic::Code::InvalidArgument); - assert!(err.message().contains("visibility_timeout_ms")); - } - - #[tokio::test] - async fn create_queue_uses_default_when_timeout_is_zero() { - let (svc, _dir) = test_admin_service(); - - let request = Request::new(CreateQueueRequest { - name: "test-queue".to_string(), - config: Some(ProtoQueueConfig { - visibility_timeout_ms: 0, // proto3 unset - ..Default::default() - }), - }); - - let resp = svc.create_queue(request).await.unwrap(); - assert_eq!(resp.into_inner().queue_id, "test-queue"); - } - - #[tokio::test] - async fn create_queue_accepts_valid_visibility_timeout() { - let (svc, _dir) = test_admin_service(); - - let request = Request::new(CreateQueueRequest { - name: "test-queue".to_string(), - config: Some(ProtoQueueConfig { - visibility_timeout_ms: 60_000, - ..Default::default() - }), - }); - - let resp = svc.create_queue(request).await.unwrap(); - assert_eq!(resp.into_inner().queue_id, "test-queue"); - } - - #[tokio::test] - async fn set_config_with_valid_throttle_key() { - let (svc, _dir) = test_admin_service(); - - let request = Request::new(SetConfigRequest { - key: "throttle.provider_a".to_string(), - value: "10.0,100.0".to_string(), - }); - - svc.set_config(request).await.unwrap(); - - // Verify via get_config - let request = Request::new(GetConfigRequest { - key: "throttle.provider_a".to_string(), - }); - let resp = svc.get_config(request).await.unwrap(); - assert_eq!(resp.into_inner().value, "10.0,100.0"); - } - - #[tokio::test] - async fn set_config_empty_key_returns_invalid_argument() { - let (svc, _dir) = test_admin_service(); - - let request = Request::new(SetConfigRequest { - key: String::new(), - value: "10.0,100.0".to_string(), - }); - - let err = svc.set_config(request).await.unwrap_err(); - assert_eq!(err.code(), tonic::Code::InvalidArgument); - } - - #[tokio::test] - async fn get_config_empty_key_returns_invalid_argument() { - let (svc, _dir) = test_admin_service(); - - let request = Request::new(GetConfigRequest { key: String::new() }); - - let err = svc.get_config(request).await.unwrap_err(); - assert_eq!(err.code(), tonic::Code::InvalidArgument); - } - - #[tokio::test] - async fn get_config_missing_key_returns_empty_value() { - let (svc, _dir) = test_admin_service(); - - let request = Request::new(GetConfigRequest { - key: "nonexistent".to_string(), - }); - - let resp = svc.get_config(request).await.unwrap(); - assert_eq!(resp.into_inner().value, ""); - } - - #[tokio::test] - async fn list_config_returns_all_entries() { - let (svc, _dir) = test_admin_service(); - - // Set a few config entries - for (key, value) in &[("throttle.provider_a", "10.0,100.0"), ("app.flag", "on")] { - svc.set_config(Request::new(SetConfigRequest { - key: key.to_string(), - value: value.to_string(), - })) - .await - .unwrap(); - } - - let resp = svc - .list_config(Request::new(ListConfigRequest { - prefix: String::new(), - })) - .await - .unwrap() - .into_inner(); - - assert_eq!(resp.total_count, 2); - assert_eq!(resp.entries.len(), 2); - // RocksDB lexicographic order: app.flag < throttle.provider_a - assert_eq!(resp.entries[0].key, "app.flag"); - assert_eq!(resp.entries[0].value, "on"); - assert_eq!(resp.entries[1].key, "throttle.provider_a"); - assert_eq!(resp.entries[1].value, "10.0,100.0"); - } - - #[tokio::test] - async fn list_config_filters_by_prefix() { - let (svc, _dir) = test_admin_service(); - - for (key, value) in &[ - ("throttle.a", "1.0,10.0"), - ("throttle.b", "2.0,20.0"), - ("app.flag", "on"), - ] { - svc.set_config(Request::new(SetConfigRequest { - key: key.to_string(), - value: value.to_string(), - })) - .await - .unwrap(); - } - - let resp = svc - .list_config(Request::new(ListConfigRequest { - prefix: "throttle.".to_string(), - })) - .await - .unwrap() - .into_inner(); - - assert_eq!(resp.total_count, 2); - assert_eq!(resp.entries[0].key, "throttle.a"); - assert_eq!(resp.entries[0].value, "1.0,10.0"); - assert_eq!(resp.entries[1].key, "throttle.b"); - assert_eq!(resp.entries[1].value, "2.0,20.0"); - } - - #[tokio::test] - async fn list_config_empty_result_is_not_error() { - let (svc, _dir) = test_admin_service(); - - let resp = svc - .list_config(Request::new(ListConfigRequest { - prefix: "nonexistent.".to_string(), - })) - .await - .unwrap() - .into_inner(); - - assert_eq!(resp.total_count, 0); - assert!(resp.entries.is_empty()); - } - - #[tokio::test] - async fn list_config_oversized_prefix_returns_invalid_argument() { - let (svc, _dir) = test_admin_service(); - - let long_prefix = "x".repeat(AdminService::MAX_CONFIG_KEY_LEN + 1); - let err = svc - .list_config(Request::new(ListConfigRequest { - prefix: long_prefix, - })) - .await - .unwrap_err(); - - assert_eq!(err.code(), tonic::Code::InvalidArgument); - } - - #[tokio::test] - async fn get_stats_returns_populated_response() { - let (svc, _dir) = test_admin_service(); - - // Create a queue first - svc.create_queue(Request::new(CreateQueueRequest { - name: "stats-q".to_string(), - config: None, - })) - .await - .unwrap(); - - let resp = svc - .get_stats(Request::new(GetStatsRequest { - queue: "stats-q".to_string(), - })) - .await - .unwrap() - .into_inner(); - - assert_eq!(resp.depth, 0); - assert_eq!(resp.in_flight, 0); - assert_eq!(resp.active_fairness_keys, 0); - assert_eq!(resp.active_consumers, 0); - } - - #[tokio::test] - async fn get_stats_empty_queue_id_returns_invalid_argument() { - let (svc, _dir) = test_admin_service(); - - let err = svc - .get_stats(Request::new(GetStatsRequest { - queue: String::new(), - })) - .await - .unwrap_err(); - - assert_eq!(err.code(), tonic::Code::InvalidArgument); - } - - #[tokio::test] - async fn get_stats_nonexistent_queue_returns_not_found() { - let (svc, _dir) = test_admin_service(); - - let err = svc - .get_stats(Request::new(GetStatsRequest { - queue: "nonexistent".to_string(), - })) - .await - .unwrap_err(); - - assert_eq!(err.code(), tonic::Code::NotFound); - } - - #[tokio::test] - async fn redrive_returns_redriven_count() { - let (svc, _dir) = test_admin_service(); - - // Create a DLQ-named queue and its parent - svc.create_queue(Request::new(CreateQueueRequest { - name: "redrive-test".to_string(), - config: None, - })) - .await - .unwrap(); - - // Redrive from the auto-created DLQ (which is empty) - let resp = svc - .redrive(Request::new(RedriveRequest { - dlq_queue: "redrive-test.dlq".to_string(), - count: 0, - })) - .await - .unwrap() - .into_inner(); - - assert_eq!(resp.redriven, 0); - } - - #[tokio::test] - async fn redrive_empty_dlq_queue_returns_invalid_argument() { - let (svc, _dir) = test_admin_service(); - - let err = svc - .redrive(Request::new(RedriveRequest { - dlq_queue: String::new(), - count: 0, - })) - .await - .unwrap_err(); - - assert_eq!(err.code(), tonic::Code::InvalidArgument); - } - - #[tokio::test] - async fn list_queues_returns_created_queues() { - let (svc, _dir) = test_admin_service(); - - // No queues initially - let resp = svc - .list_queues(Request::new(ListQueuesRequest {})) - .await - .unwrap() - .into_inner(); - assert!(resp.queues.is_empty()); - - // Create two queues - svc.create_queue(Request::new(CreateQueueRequest { - name: "alpha".to_string(), - config: None, - })) - .await - .unwrap(); - svc.create_queue(Request::new(CreateQueueRequest { - name: "beta".to_string(), - config: None, - })) - .await - .unwrap(); - - let resp = svc - .list_queues(Request::new(ListQueuesRequest {})) - .await - .unwrap() - .into_inner(); - - // Should have 4 queues: alpha, alpha.dlq, beta, beta.dlq - let names: Vec<&str> = resp.queues.iter().map(|q| q.name.as_str()).collect(); - assert!(names.contains(&"alpha")); - assert!(names.contains(&"alpha.dlq")); - assert!(names.contains(&"beta")); - assert!(names.contains(&"beta.dlq")); - assert_eq!(resp.queues.len(), 4); - - // All should have zero depth/in_flight/consumers - for q in &resp.queues { - assert_eq!(q.depth, 0); - assert_eq!(q.in_flight, 0); - assert_eq!(q.active_consumers, 0); - } - } - - #[tokio::test] - async fn get_stats_single_node_returns_zero_cluster_fields() { - let (svc, _dir) = test_admin_service(); - - svc.create_queue(Request::new(CreateQueueRequest { - name: "single-q".to_string(), - config: None, - })) - .await - .unwrap(); - - let resp = svc - .get_stats(Request::new(GetStatsRequest { - queue: "single-q".to_string(), - })) - .await - .unwrap() - .into_inner(); - - assert_eq!( - resp.leader_node_id, 0, - "single-node should have leader_node_id=0" - ); - assert_eq!( - resp.replication_count, 0, - "single-node should have replication_count=0" - ); - } - - #[tokio::test] - async fn list_queues_single_node_returns_zero_cluster_fields() { - let (svc, _dir) = test_admin_service(); - - svc.create_queue(Request::new(CreateQueueRequest { - name: "single-q".to_string(), - config: None, - })) - .await - .unwrap(); - - let resp = svc - .list_queues(Request::new(ListQueuesRequest {})) - .await - .unwrap() - .into_inner(); - - assert_eq!( - resp.cluster_node_count, 0, - "single-node should have cluster_node_count=0" - ); - for q in &resp.queues { - assert_eq!( - q.leader_node_id, 0, - "single-node queue should have leader_node_id=0" - ); - } - } - - #[tokio::test] - async fn redrive_non_dlq_queue_returns_invalid_argument() { - let (svc, _dir) = test_admin_service(); - - svc.create_queue(Request::new(CreateQueueRequest { - name: "not-a-dlq".to_string(), - config: None, - })) - .await - .unwrap(); - - let err = svc - .redrive(Request::new(RedriveRequest { - dlq_queue: "not-a-dlq".to_string(), - count: 0, - })) - .await - .unwrap_err(); - - assert_eq!(err.code(), tonic::Code::InvalidArgument); - } -} diff --git a/crates/fila-server/src/auth.rs b/crates/fila-server/src/auth.rs deleted file mode 100644 index 5c647a32..00000000 --- a/crates/fila-server/src/auth.rs +++ /dev/null @@ -1,143 +0,0 @@ -//! Tower layer that enforces API key authentication on incoming gRPC requests. -//! -//! When `broker.auth_enabled` is `true`, every incoming RPC must include an -//! `authorization: Bearer ` metadata header. Requests without a valid key -//! receive `UNAUTHENTICATED`. When auth is disabled (the default), all requests -//! pass through unconditionally. -//! -//! No RPC bypasses authentication. To provision the first API key, configure -//! a `bootstrap_apikey` in `fila.toml` or set `FILA_BOOTSTRAP_APIKEY`. The -//! bootstrap key is accepted as a valid credential by `validate_api_key` without -//! a storage lookup, so operators can use it to call `CreateApiKey` and then -//! rotate it out once real keys are in place. -//! -//! When a request is authenticated, the middleware injects a `ValidatedKeyId` -//! extension into the HTTP request. Service handlers extract it to perform -//! per-queue ACL checks. - -use std::sync::Arc; -use std::task::{Context as TaskContext, Poll}; - -use fila_core::Broker; -use tower::{Layer, Service}; - -/// Request extension injected by the auth middleware for authenticated requests. -/// -/// Service handlers extract this to perform ACL checks via `broker.check_permission`. -/// Absent when auth is disabled. -#[derive(Clone, Debug)] -pub struct ValidatedKeyId(pub fila_core::broker::auth::CallerKey); - -/// gRPC paths that bypass authentication. Empty — all RPCs require a valid key -/// when auth is enabled. Use `bootstrap_apikey` in config or `FILA_BOOTSTRAP_APIKEY` -/// to provision the first key without a bypass. -const AUTH_BYPASS_PATHS: &[&str] = &[]; - -/// Tower layer that wraps services with API key authentication. -#[derive(Clone)] -pub struct AuthLayer { - broker: Arc, -} - -impl AuthLayer { - pub fn new(broker: Arc) -> Self { - Self { broker } - } -} - -impl Layer for AuthLayer { - type Service = AuthService; - - fn layer(&self, inner: S) -> Self::Service { - AuthService { - inner, - broker: Arc::clone(&self.broker), - } - } -} - -/// Tower service wrapper that validates the `authorization` header. -#[derive(Clone)] -pub struct AuthService { - inner: S, - broker: Arc, -} - -impl Service> for AuthService -where - S: Service, Response = http::Response> - + Clone - + Send - + 'static, - S::Future: Send + 'static, - B: Send + 'static, -{ - type Response = S::Response; - type Error = S::Error; - type Future = futures_core::future::BoxFuture<'static, Result>; - - fn poll_ready(&mut self, cx: &mut TaskContext<'_>) -> Poll> { - self.inner.poll_ready(cx) - } - - fn call(&mut self, mut req: http::Request) -> Self::Future { - // Take the readied service and replace it with a clone so that - // `self.inner` is ready for the next poll_ready/call cycle. - // This satisfies the Tower contract: the instance that passed - // poll_ready must be the one used for the actual call. - let clone = self.inner.clone(); - let mut inner = std::mem::replace(&mut self.inner, clone); - - // When auth is disabled, pass through unconditionally. - if !self.broker.auth_enabled { - return Box::pin(inner.call(req)); - } - - // Check bypass paths (currently empty — all RPCs require auth). - let path = req.uri().path(); - for bypass in AUTH_BYPASS_PATHS { - if path == *bypass { - return Box::pin(inner.call(req)); - } - } - - // Extract the Bearer token from the `authorization` metadata header. - let token = req - .headers() - .get("authorization") - .and_then(|v| v.to_str().ok()) - .and_then(|s| s.strip_prefix("Bearer ")) - .map(|s| s.to_string()); - - let broker = Arc::clone(&self.broker); - - Box::pin(async move { - let token = match token { - Some(t) => t, - None => { - return Ok(unauthenticated_response()); - } - }; - - match broker.validate_api_key(&token) { - Ok(Some(caller)) => { - req.extensions_mut().insert(ValidatedKeyId(caller)); - inner.call(req).await - } - Ok(None) => Ok(unauthenticated_response()), - Err(e) => { - tracing::error!(error = %e, "auth storage error"); - Ok(internal_error_response()) - } - } - }) - } -} - -fn unauthenticated_response() -> http::Response { - tonic::Status::unauthenticated("invalid or missing api key").into_http() -} - -fn internal_error_response() -> http::Response { - tonic::Status::internal("authentication service unavailable").into_http() -} diff --git a/crates/fila-server/src/binary_handlers.rs b/crates/fila-server/src/binary_handlers.rs index 2a37b13d..433bc30b 100644 --- a/crates/fila-server/src/binary_handlers.rs +++ b/crates/fila-server/src/binary_handlers.rs @@ -1055,6 +1055,20 @@ pub fn handle_set_acl( broker: &Broker, req: fila_fibp::SetAclRequest, ) -> Result { + // Validate permission kinds before applying. + for p in &req.permissions { + match p.kind.as_str() { + "produce" | "consume" | "admin" => {} + other => { + return Err(( + ErrorCode::InvalidConfigValue, + format!( + "invalid permission kind \"{other}\": expected produce, consume, or admin" + ), + )); + } + } + } let permissions: Vec<(String, String)> = req .permissions .into_iter() diff --git a/crates/fila-server/src/binary_server.rs b/crates/fila-server/src/binary_server.rs index 4eaebc78..bb6e3dee 100644 --- a/crates/fila-server/src/binary_server.rs +++ b/crates/fila-server/src/binary_server.rs @@ -1,5 +1,5 @@ //! Binary protocol server — accepts TCP connections and dispatches FIBP frames -//! to the broker's scheduler through the same command channel as gRPC. +//! to the broker's scheduler through the command channel. use std::collections::HashMap; use std::sync::Arc; diff --git a/crates/fila-server/src/error.rs b/crates/fila-server/src/error.rs deleted file mode 100644 index 8d4a6abe..00000000 --- a/crates/fila-server/src/error.rs +++ /dev/null @@ -1,103 +0,0 @@ -use fila_core::{ - AckError, BrokerError, ConfigError, CreateQueueError, DeleteQueueError, EnqueueError, - ListQueuesError, NackError, RedriveError, StatsError, -}; -use tonic::Status; - -pub trait IntoStatus { - fn into_status(self) -> Status; -} - -impl IntoStatus for EnqueueError { - fn into_status(self) -> Status { - match self { - EnqueueError::QueueNotFound(msg) => Status::not_found(msg), - EnqueueError::Storage(e) => Status::internal(e.to_string()), - } - } -} - -impl IntoStatus for AckError { - fn into_status(self) -> Status { - match self { - AckError::MessageNotFound(msg) => Status::not_found(msg), - AckError::Storage(e) => Status::internal(e.to_string()), - } - } -} - -impl IntoStatus for NackError { - fn into_status(self) -> Status { - match self { - NackError::MessageNotFound(msg) => Status::not_found(msg), - NackError::Storage(e) => Status::internal(e.to_string()), - } - } -} - -impl IntoStatus for CreateQueueError { - fn into_status(self) -> Status { - match self { - CreateQueueError::QueueAlreadyExists(msg) => Status::already_exists(msg), - CreateQueueError::LuaCompilation(msg) => Status::invalid_argument(msg), - CreateQueueError::Storage(e) => Status::internal(e.to_string()), - } - } -} - -impl IntoStatus for DeleteQueueError { - fn into_status(self) -> Status { - match self { - DeleteQueueError::QueueNotFound(msg) => Status::not_found(msg), - DeleteQueueError::Storage(e) => Status::internal(e.to_string()), - } - } -} - -impl IntoStatus for ConfigError { - fn into_status(self) -> Status { - match self { - ConfigError::InvalidValue(msg) => Status::invalid_argument(msg), - ConfigError::Storage(e) => Status::internal(e.to_string()), - } - } -} - -impl IntoStatus for StatsError { - fn into_status(self) -> Status { - match self { - StatsError::QueueNotFound(msg) => Status::not_found(msg), - StatsError::Storage(e) => Status::internal(e.to_string()), - } - } -} - -impl IntoStatus for RedriveError { - fn into_status(self) -> Status { - match self { - RedriveError::QueueNotFound(msg) => Status::not_found(msg), - RedriveError::NotADLQ(msg) => Status::invalid_argument(msg), - RedriveError::ParentQueueNotFound(msg) => Status::failed_precondition(msg), - RedriveError::Storage(e) => Status::internal(e.to_string()), - } - } -} - -impl IntoStatus for ListQueuesError { - fn into_status(self) -> Status { - match self { - ListQueuesError::Storage(e) => Status::internal(e.to_string()), - } - } -} - -impl IntoStatus for BrokerError { - fn into_status(self) -> Status { - match self { - BrokerError::SchedulerSpawn(msg) => Status::internal(msg), - BrokerError::ChannelFull => Status::resource_exhausted("scheduler overloaded"), - BrokerError::ChannelDisconnected => Status::unavailable("scheduler unavailable"), - BrokerError::SchedulerPanicked => Status::internal("scheduler panicked"), - } - } -} diff --git a/crates/fila-server/src/main.rs b/crates/fila-server/src/main.rs index 46c9621a..4540cbba 100644 --- a/crates/fila-server/src/main.rs +++ b/crates/fila-server/src/main.rs @@ -1,37 +1,11 @@ -mod admin_service; -mod auth; -mod error; -mod service; -mod trace_context; - use fila_server::binary_server; use std::path::Path; use std::sync::Arc; -use fila_core::{Broker, BrokerConfig, RocksDbEngine, TlsParams}; -use fila_proto::fila_admin_server::FilaAdminServer; -use fila_proto::fila_service_server::FilaServiceServer; -use tonic::transport::{Certificate, Identity, Server, ServerTlsConfig}; +use fila_core::{Broker, BrokerConfig, RocksDbEngine}; use tracing::info; -use admin_service::AdminService; -use service::HotPathService; - -/// Build a `ServerTlsConfig` from `TlsParams`. -/// `ca_file` present → mTLS (verify client certs); absent → server-TLS only. -async fn load_server_tls(tls: &TlsParams) -> Result> { - let cert_pem = tokio::fs::read(&tls.cert_file).await?; - let key_pem = tokio::fs::read(&tls.key_file).await?; - let identity = Identity::from_pem(cert_pem, key_pem); - let mut server_tls = ServerTlsConfig::new().identity(identity); - if let Some(ref ca_file) = tls.ca_file { - let ca_pem = tokio::fs::read(ca_file).await?; - server_tls = server_tls.client_ca_root(Certificate::from_pem(ca_pem)); - } - Ok(server_tls) -} - fn load_config() -> BrokerConfig { let paths = ["fila.toml", "/etc/fila/fila.toml"]; @@ -62,11 +36,8 @@ fn load_config() -> BrokerConfig { #[tokio::main] async fn main() -> Result<(), Box> { - // Install the ring CryptoProvider before any TLS operations. - // Both `ring` and `aws-lc-rs` features are enabled in the dependency tree - // (tonic uses ring via tls-ring, tokio-rustls pulls both), so rustls cannot - // auto-detect which to use. Explicitly installing ring matches tonic's choice. - rustls::crypto::ring::default_provider() + // Install the aws-lc-rs CryptoProvider before any TLS operations. + rustls::crypto::aws_lc_rs::default_provider() .install_default() .expect("install rustls CryptoProvider"); @@ -107,11 +78,7 @@ async fn main() -> Result<(), Box> { Arc::clone(&storage), Some(meta_event_tx), tls_params.as_ref(), - config - .server - .binary_addr - .as_deref() - .unwrap_or(&config.server.listen_addr), + &config.server.listen_addr, ) .await?, ) @@ -121,11 +88,10 @@ async fn main() -> Result<(), Box> { let cluster_handle = cluster_manager.as_ref().map(|cm| cm.handle()); - let binary_addr_cfg = config.server.binary_addr.clone(); let broker = Arc::new(Broker::new(config, Arc::clone(&storage))?); - // Wire cluster ↔ broker integration: - // 1. Give the cluster gRPC service access to the Broker so forwarded + // Wire cluster <-> broker integration: + // 1. Give the cluster binary service access to the Broker so forwarded // writes can be applied to the leader's local scheduler. // 2. Start meta event handler for queue group lifecycle. // 3. Start leader change watcher for failover (recover queue / drop consumers). @@ -152,64 +118,44 @@ async fn main() -> Result<(), Box> { )); } - let admin_service = AdminService::new(Arc::clone(&broker), cluster_handle.clone()); - let hot_path_service = HotPathService::new(Arc::clone(&broker), cluster_handle.clone()); - - // --- Binary protocol server (optional, enabled via [server].binary_addr) --- - let binary_shutdown_tx = if let Some(binary_addr) = binary_addr_cfg { - let binary_listener = tokio::net::TcpListener::bind(&binary_addr).await?; - info!(%binary_addr, "binary protocol listener bound"); - - let binary_tls = if let Some(ref tls) = tls_params { - Some(binary_server::build_tls_acceptor(tls).await?) - } else { - None - }; - - let node_id = if cluster_config.enabled { - cluster_config.node_id - } else { - 0 - }; - let binary = Arc::new(binary_server::BinaryServer::new( - Arc::clone(&broker), - cluster_handle.clone(), - node_id, - )); + // --- Binary protocol server (mandatory) --- + let binary_listener = tokio::net::TcpListener::bind(&listen_addr).await?; + info!(%listen_addr, "binary protocol listener bound"); - let (tx, rx) = tokio::sync::watch::channel(false); - tokio::spawn(binary_server::run(binary, binary_listener, binary_tls, rx)); - Some(tx) + let binary_tls = if let Some(ref tls) = tls_params { + Some(binary_server::build_tls_acceptor(tls).await?) } else { None }; - // --- gRPC server (temporary, for cluster comms until Story 20.4) --- - let grpc_addr = listen_addr.parse()?; - info!(%grpc_addr, tls = tls_params.is_some(), "starting gRPC server"); + let node_id = if cluster_config.enabled { + cluster_config.node_id + } else { + 0 + }; + let binary = Arc::new(binary_server::BinaryServer::new( + Arc::clone(&broker), + cluster_handle.clone(), + node_id, + )); - let mut server_builder = Server::builder(); - if let Some(ref tls) = tls_params { - let server_tls = load_server_tls(tls).await?; - server_builder = server_builder.tls_config(server_tls)?; - } + let (shutdown_tx, shutdown_rx) = tokio::sync::watch::channel(false); - // Layer order: last `.layer()` becomes outermost (first to receive requests). - // AuthLayer must be inner so auth runs within the trace context span. - let serve_result = server_builder - .layer(auth::AuthLayer::new(Arc::clone(&broker))) - .layer(trace_context::TraceContextLayer) - .add_service(FilaAdminServer::new(admin_service)) - .add_service(FilaServiceServer::new(hot_path_service)) - .serve_with_shutdown(grpc_addr, shutdown_signal()) - .await; - - info!("gRPC server stopped, shutting down broker"); - if let Some(tx) = binary_shutdown_tx { - let _ = tx.send(true); - } + let server_handle = tokio::spawn(binary_server::run( + binary, + binary_listener, + binary_tls, + shutdown_rx, + )); - // Graceful broker shutdown — Drop impl will handle it since Arc may have refs + // Wait for shutdown signal. + shutdown_signal().await; + + info!("shutting down binary protocol server"); + let _ = shutdown_tx.send(true); + let _ = server_handle.await; + + // Graceful broker shutdown drop(broker); // Shut down leader change watcher and Raft before exiting. @@ -218,8 +164,6 @@ async fn main() -> Result<(), Box> { cm.shutdown().await; } - serve_result?; - // Flush OTel pipeline (spans + metrics) before exit if let Some(guard) = telemetry_guard { guard.shutdown(); diff --git a/crates/fila-server/src/service.rs b/crates/fila-server/src/service.rs deleted file mode 100644 index a2ffa248..00000000 --- a/crates/fila-server/src/service.rs +++ /dev/null @@ -1,531 +0,0 @@ -use std::sync::Arc; - -use fila_core::broker::command::{AckItem, NackItem}; -use fila_core::cluster::{AckItemData, NackItemData}; -use fila_core::{ - Broker, ClusterHandle, ClusterRequest, ClusterResponse, ClusterWriteError, Message, - ReadyMessage, SchedulerCommand, -}; -use fila_proto::fila_service_server::FilaService; -use fila_proto::{ - AckRequest, AckResponse, ConsumeRequest, ConsumeResponse, EnqueueRequest, EnqueueResponse, - NackRequest, NackResponse, -}; -use tonic::{Request, Response, Status}; -use tracing::{debug, instrument}; -use uuid::Uuid; - -use crate::auth::ValidatedKeyId; -use crate::error::IntoStatus; - -/// Map cluster write errors to appropriate gRPC status codes. -fn cluster_write_err_to_status(err: ClusterWriteError) -> Status { - match err { - ClusterWriteError::QueueGroupNotFound => Status::not_found("queue raft group not found"), - ClusterWriteError::NodeNotReady => { - Status::unavailable("node not ready for this queue — retry on another node or wait") - } - ClusterWriteError::NoLeader => Status::unavailable("no leader available"), - ClusterWriteError::Raft(e) => Status::internal(format!("raft error: {e}")), - ClusterWriteError::Forward(e) => Status::unavailable(format!("forward error: {e}")), - } -} - -/// gRPC hot-path service implementation for producers and consumers. -pub struct HotPathService { - broker: Arc, - cluster: Option>, -} - -impl HotPathService { - pub fn new(broker: Arc, cluster: Option>) -> Self { - Self { broker, cluster } - } -} - -/// Convert a ReadyMessage to a protobuf Message for the ConsumeResponse. -fn ready_to_proto(ready: ReadyMessage) -> fila_proto::Message { - fila_proto::Message { - id: ready.msg_id.to_string(), - headers: ready.headers, - payload: ready.payload, - metadata: Some(fila_proto::MessageMetadata { - fairness_key: ready.fairness_key, - weight: ready.weight, - throttle_keys: ready.throttle_keys, - attempt_count: ready.attempt_count, - queue_id: ready.queue_id, - }), - timestamps: None, - } -} -#[tonic::async_trait] -impl FilaService for HotPathService { - #[instrument(skip(self, request), fields(queue_id, msg_id))] - async fn enqueue( - &self, - request: Request, - ) -> Result, Status> { - let caller = request - .extensions() - .get::() - .map(|k| k.0.clone()); - let req = request.into_inner(); - - if req.queue.is_empty() { - return Err(Status::invalid_argument("queue name must not be empty")); - } - - // ACL check: produce permission on this queue. - if let Some(ref caller) = caller { - let permitted = self - .broker - .check_permission( - caller, - fila_core::broker::auth::Permission::Produce, - &req.queue, - ) - .map_err(|e| Status::internal(format!("acl check error: {e}")))?; - if !permitted { - return Err(Status::permission_denied(format!( - "key does not have produce permission on queue \"{}\"", - req.queue - ))); - } - } - - let now_ns = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_nanos() as u64; - - let msg_id = Uuid::now_v7(); - tracing::Span::current().record("queue_id", req.queue.as_str()); - tracing::Span::current().record("msg_id", tracing::field::display(&msg_id)); - - let message = Message { - id: msg_id, - queue_id: req.queue.clone(), - headers: req.headers, - payload: req.payload, - fairness_key: "default".to_string(), - weight: 1, - throttle_keys: vec![], - attempt_count: 0, - enqueued_at: now_ns, - leased_at: None, - }; - - if let Some(ref cluster) = self.cluster { - // Cluster mode: commit through the queue's Raft group. - let result = cluster - .write_to_queue( - &req.queue, - ClusterRequest::Enqueue { - messages: vec![message.clone()], - }, - ) - .await - .map_err(cluster_write_err_to_status)?; - - let msg_id = match result.response { - ClusterResponse::Enqueue { msg_id } => msg_id, - ClusterResponse::Error { message } => { - return Err(Status::internal(message)); - } - _ => return Err(Status::internal("unexpected cluster response")), - }; - - // Apply to local scheduler only if this node handled the - // write (is the leader). Forwarded writes are applied by - // the leader's ClientWrite handler. - if result.handled_locally { - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::Enqueue { - messages: vec![message], - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - let results = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))?; - // Unwrap single result - results - .into_iter() - .next() - .unwrap_or(Err(fila_core::EnqueueError::QueueNotFound( - "unknown".into(), - ))) - .map_err(IntoStatus::into_status)?; - } - - Ok(Response::new(EnqueueResponse { - message_id: msg_id.to_string(), - })) - } else { - // Single-node mode: direct to scheduler. - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::Enqueue { - messages: vec![message], - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - let results = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))?; - // Unwrap batch-of-1 result - let msg_id = results - .into_iter() - .next() - .unwrap_or(Err(fila_core::EnqueueError::QueueNotFound( - "unknown".into(), - ))) - .map_err(IntoStatus::into_status)?; - - Ok(Response::new(EnqueueResponse { - message_id: msg_id.to_string(), - })) - } - } - - type ConsumeStream = tokio_stream::wrappers::ReceiverStream>; - - #[instrument(skip(self, request), fields(queue_id))] - async fn consume( - &self, - request: Request, - ) -> Result, Status> { - let caller = request - .extensions() - .get::() - .map(|k| k.0.clone()); - let req = request.into_inner(); - - if req.queue.is_empty() { - return Err(Status::invalid_argument("queue name must not be empty")); - } - - // ACL check: consume permission on this queue. - if let Some(ref caller) = caller { - let permitted = self - .broker - .check_permission( - caller, - fila_core::broker::auth::Permission::Consume, - &req.queue, - ) - .map_err(|e| Status::internal(format!("acl check error: {e}")))?; - if !permitted { - return Err(Status::permission_denied(format!( - "key does not have consume permission on queue \"{}\"", - req.queue - ))); - } - } - - tracing::Span::current().record("queue_id", req.queue.as_str()); - - // In cluster mode, consumers must connect to the queue's Raft leader - // so the DRR scheduler on the leader can serve them directly. - if let Some(ref cluster) = self.cluster { - match cluster.is_queue_leader(&req.queue).await { - Some(true) => { /* This node is the leader — proceed */ } - Some(false) => { - let mut status = Status::unavailable( - "this node is not the leader for this queue; reconnect to the leader", - ); - // Include the leader's client address in gRPC metadata so - // SDKs can transparently reconnect to the correct node. - if let Some(addr) = cluster.get_queue_leader_client_addr(&req.queue).await { - // Don't advertise wildcard addresses (0.0.0.0) — they're - // not routable from clients. Only send the hint when the - // leader's address is a concrete host. - if !addr.starts_with("0.0.0.0") { - if let Ok(val) = addr.parse() { - status.metadata_mut().insert("x-fila-leader-addr", val); - } - } - } - return Err(status); - } - None => { - return Err(Status::not_found("queue raft group not found")); - } - } - } - - let consumer_id = Uuid::now_v7().to_string(); - - // Channel from scheduler (ReadyMessage) to converter task - let (ready_tx, mut ready_rx) = tokio::sync::mpsc::channel::(64); - - // Channel from converter task to gRPC stream - let (stream_tx, stream_rx) = tokio::sync::mpsc::channel(64); - - // Register consumer with the scheduler - self.broker - .send_command(SchedulerCommand::RegisterConsumer { - queue_id: req.queue.clone(), - consumer_id: consumer_id.clone(), - tx: ready_tx, - }) - .map_err(IntoStatus::into_status)?; - - debug!(%consumer_id, queue = %req.queue, "consume stream opened"); - - // Spawn a converter task that bridges ReadyMessage -> ConsumeResponse - // and handles cleanup on disconnect - let broker = Arc::clone(&self.broker); - let cid = consumer_id.clone(); - tokio::spawn(async move { - loop { - tokio::select! { - msg = ready_rx.recv() => { - match msg { - Some(ready) => { - let response = ConsumeResponse { - message: Some(ready_to_proto(ready)), - }; - if stream_tx.send(Ok(response)).await.is_err() { - break; // gRPC stream closed - } - } - None => break, // Scheduler dropped our sender - } - } - _ = stream_tx.closed() => { - break; // Client disconnected - } - } - } - // Unregister consumer when the stream ends - debug!(consumer_id = %cid, "consume stream closed, unregistering consumer"); - let _ = broker.send_command(SchedulerCommand::UnregisterConsumer { consumer_id: cid }); - }); - - Ok(Response::new(tokio_stream::wrappers::ReceiverStream::new( - stream_rx, - ))) - } - - #[instrument(skip(self, request), fields(queue_id, msg_id))] - async fn ack(&self, request: Request) -> Result, Status> { - let caller = request - .extensions() - .get::() - .map(|k| k.0.clone()); - let req = request.into_inner(); - - if req.queue.is_empty() { - return Err(Status::invalid_argument("queue name must not be empty")); - } - - // ACL check: consume permission on this queue (ack is part of the consume lifecycle). - if let Some(ref caller) = caller { - let permitted = self - .broker - .check_permission( - caller, - fila_core::broker::auth::Permission::Consume, - &req.queue, - ) - .map_err(|e| Status::internal(format!("acl check error: {e}")))?; - if !permitted { - return Err(Status::permission_denied(format!( - "key does not have consume permission on queue \"{}\"", - req.queue - ))); - } - } - if req.message_id.is_empty() { - return Err(Status::invalid_argument("message_id must not be empty")); - } - - tracing::Span::current().record("queue_id", req.queue.as_str()); - tracing::Span::current().record("msg_id", req.message_id.as_str()); - - let msg_id: Uuid = req - .message_id - .parse() - .map_err(|_| Status::invalid_argument("invalid message_id format"))?; - - let ack_item = AckItem { - queue_id: req.queue.clone(), - msg_id, - }; - - if let Some(ref cluster) = self.cluster { - // Cluster mode: commit through queue Raft. - let result = cluster - .write_to_queue( - &ack_item.queue_id, - ClusterRequest::Ack { - items: vec![AckItemData { - queue_id: ack_item.queue_id.clone(), - msg_id: ack_item.msg_id, - }], - }, - ) - .await - .map_err(cluster_write_err_to_status)?; - - if let ClusterResponse::Error { message } = result.response { - return Err(Status::internal(message)); - } - - if result.handled_locally { - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::Ack { - items: vec![ack_item], - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - let results = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))?; - // Unwrap single result - results - .into_iter() - .next() - .unwrap_or(Err(fila_core::AckError::MessageNotFound("unknown".into()))) - .map_err(IntoStatus::into_status)?; - } - } else { - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::Ack { - items: vec![ack_item], - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - let results = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))?; - // Unwrap batch-of-1 result - results - .into_iter() - .next() - .unwrap_or(Err(fila_core::AckError::MessageNotFound("unknown".into()))) - .map_err(IntoStatus::into_status)?; - } - - Ok(Response::new(AckResponse {})) - } - - #[instrument(skip(self, request), fields(queue_id, msg_id))] - async fn nack(&self, request: Request) -> Result, Status> { - let caller = request - .extensions() - .get::() - .map(|k| k.0.clone()); - let req = request.into_inner(); - - if req.queue.is_empty() { - return Err(Status::invalid_argument("queue name must not be empty")); - } - - // ACL check: consume permission on this queue (nack is part of the consume lifecycle). - if let Some(ref caller) = caller { - let permitted = self - .broker - .check_permission( - caller, - fila_core::broker::auth::Permission::Consume, - &req.queue, - ) - .map_err(|e| Status::internal(format!("acl check error: {e}")))?; - if !permitted { - return Err(Status::permission_denied(format!( - "key does not have consume permission on queue \"{}\"", - req.queue - ))); - } - } - - if req.message_id.is_empty() { - return Err(Status::invalid_argument("message_id must not be empty")); - } - - tracing::Span::current().record("queue_id", req.queue.as_str()); - tracing::Span::current().record("msg_id", req.message_id.as_str()); - - let msg_id: Uuid = req - .message_id - .parse() - .map_err(|_| Status::invalid_argument("invalid message_id format"))?; - - let nack_item = NackItem { - queue_id: req.queue.clone(), - msg_id, - error: req.error.clone(), - }; - - if let Some(ref cluster) = self.cluster { - // Cluster mode: commit through queue Raft. - let result = cluster - .write_to_queue( - &nack_item.queue_id, - ClusterRequest::Nack { - items: vec![NackItemData { - queue_id: nack_item.queue_id.clone(), - msg_id: nack_item.msg_id, - error: nack_item.error.clone(), - }], - }, - ) - .await - .map_err(cluster_write_err_to_status)?; - - if let ClusterResponse::Error { message } = result.response { - return Err(Status::internal(message)); - } - - if result.handled_locally { - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::Nack { - items: vec![nack_item], - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - let results = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))?; - // Unwrap single result - results - .into_iter() - .next() - .unwrap_or(Err(fila_core::NackError::MessageNotFound("unknown".into()))) - .map_err(IntoStatus::into_status)?; - } - } else { - let (reply_tx, reply_rx) = tokio::sync::oneshot::channel(); - self.broker - .send_command(SchedulerCommand::Nack { - items: vec![nack_item], - reply: reply_tx, - }) - .map_err(IntoStatus::into_status)?; - - let results = reply_rx - .await - .map_err(|_| Status::internal("scheduler reply channel dropped"))?; - // Unwrap batch-of-1 result - results - .into_iter() - .next() - .unwrap_or(Err(fila_core::NackError::MessageNotFound("unknown".into()))) - .map_err(IntoStatus::into_status)?; - } - - Ok(Response::new(NackResponse {})) - } -} diff --git a/crates/fila-server/src/trace_context.rs b/crates/fila-server/src/trace_context.rs deleted file mode 100644 index 595e295a..00000000 --- a/crates/fila-server/src/trace_context.rs +++ /dev/null @@ -1,99 +0,0 @@ -//! Tower layer that extracts W3C trace context (traceparent/tracestate) -//! from incoming gRPC request HTTP headers and propagates it to the -//! current OpenTelemetry context. -//! -//! This completes the deferred item from Story 6.1: the W3C -//! TraceContextPropagator is registered globally in `telemetry.rs`, -//! and this layer uses it to extract parent context from incoming -//! requests so that handler spans link to the caller's trace. - -use std::task::{Context as TaskContext, Poll}; - -use opentelemetry::propagation::Extractor; -use tower::{Layer, Service}; - -/// Extracts W3C trace context headers from an HTTP header map. -struct HeaderExtractor<'a>(&'a http::HeaderMap); - -impl Extractor for HeaderExtractor<'_> { - fn get(&self, key: &str) -> Option<&str> { - self.0.get(key).and_then(|v| v.to_str().ok()) - } - - fn keys(&self) -> Vec<&str> { - self.0.keys().map(|k| k.as_str()).collect() - } -} - -/// Tower layer that wraps services with W3C trace context extraction. -#[derive(Clone)] -pub struct TraceContextLayer; - -impl Layer for TraceContextLayer { - type Service = TraceContextService; - - fn layer(&self, inner: S) -> Self::Service { - TraceContextService { inner } - } -} - -/// Tower service wrapper that extracts W3C trace context from incoming -/// HTTP headers and creates a parent tracing span linked to the caller's -/// trace. All downstream handler spans become children of this span. -#[derive(Clone)] -pub struct TraceContextService { - inner: S, -} - -impl Service> for TraceContextService -where - S: Service>, -{ - type Response = S::Response; - type Error = S::Error; - type Future = Instrumented; - - fn poll_ready(&mut self, cx: &mut TaskContext<'_>) -> Poll> { - self.inner.poll_ready(cx) - } - - fn call(&mut self, req: http::Request) -> Self::Future { - // Extract W3C trace context from HTTP headers using the globally - // registered TraceContextPropagator (set in telemetry.rs). - let parent_cx = opentelemetry::global::get_text_map_propagator(|prop| { - prop.extract(&HeaderExtractor(req.headers())) - }); - - // Attach the extracted context as current so the tracing span - // picks it up as its parent (via tracing-opentelemetry). - let _guard = parent_cx.attach(); - let span = tracing::info_span!("grpc.server"); - - // The span now holds a reference to the parent context; the guard - // can be dropped safely. Instrument the inner future with this span - // so all handler spans become children. - Instrumented { - inner: self.inner.call(req), - span, - } - } -} - -// Future wrapper that instruments an inner future with a tracing span. -pin_project_lite::pin_project! { - pub struct Instrumented { - #[pin] - inner: F, - span: tracing::Span, - } -} - -impl std::future::Future for Instrumented { - type Output = F::Output; - - fn poll(self: std::pin::Pin<&mut Self>, cx: &mut TaskContext<'_>) -> Poll { - let this = self.project(); - let _enter = this.span.enter(); - this.inner.poll(cx) - } -}