| {status.agent_name} |
+ {status.model} |
{status.error && (
<>
diff --git a/resources/ts/components/Dashboard.module.css b/resources/ts/components/Dashboard.module.css
index 86f49b91..85458e4e 100644
--- a/resources/ts/components/Dashboard.module.css
+++ b/resources/ts/components/Dashboard.module.css
@@ -7,11 +7,29 @@
th {
border: 1px solid var(--color-border);
padding: var(--spacing-base);
+ cursor: pointer;
p + p {
margin-top: var(--spacing-half);
}
}
+
+ th:hover {
+ background-color: var(--color-hover);
+ }
+}
+
+.sortIndicator {
+ margin-left: var(--spacing-half);
+ font-size: 0.8em;
+}
+
+.sortIndicatorAsc {
+ color: var(--color-success);
+}
+
+.sortIndicatorDesc {
+ color: var(--color-error);
}
.agentRow.agentRowError {
diff --git a/resources/ts/schemas/Agent.ts b/resources/ts/schemas/Agent.ts
index 678bd124..c2eea84d 100644
--- a/resources/ts/schemas/Agent.ts
+++ b/resources/ts/schemas/Agent.ts
@@ -5,6 +5,7 @@ import { StatusUpdateSchema } from "./StatusUpdate";
export const AgentSchema = z
.object({
agent_id: z.string(),
+ model: z.string().nullable(),
last_update: z.object({
nanos_since_epoch: z.number(),
secs_since_epoch: z.number(),
diff --git a/resources/ts/schemas/StatusUpdate.ts b/resources/ts/schemas/StatusUpdate.ts
index 7543d925..52747e5b 100644
--- a/resources/ts/schemas/StatusUpdate.ts
+++ b/resources/ts/schemas/StatusUpdate.ts
@@ -14,6 +14,7 @@ export const StatusUpdateSchema = z
is_unexpected_response_status: z.boolean().nullable(),
slots_idle: z.number(),
slots_processing: z.number(),
+ model: z.string().nullable(),
})
.strict();
diff --git a/src/agent/monitoring_service.rs b/src/agent/monitoring_service.rs
index 4837493f..1eb14658 100644
--- a/src/agent/monitoring_service.rs
+++ b/src/agent/monitoring_service.rs
@@ -23,6 +23,7 @@ pub struct MonitoringService {
monitoring_interval: Duration,
name: Option,
status_update_tx: Sender,
+ check_model: bool, // When true, each status update queries llamacpp_client for the model name
}
impl MonitoringService {
@@ -32,6 +33,7 @@ impl MonitoringService {
monitoring_interval: Duration,
name: Option,
status_update_tx: Sender,
+ check_model: bool, // Whether status updates should look up the model via llamacpp_client
) -> Result {
Ok(MonitoringService {
external_llamacpp_addr,
@@ -39,6 +41,7 @@ impl MonitoringService {
monitoring_interval,
name,
status_update_tx,
+ check_model,
})
}
@@ -50,6 +53,15 @@ impl MonitoringService {
.filter(|slot| slot.is_processing)
.count();
+ let model: Option = if self.check_model {
+ match self.llamacpp_client.get_model().await {
+ Ok(model) => model,
+ Err(_) => None,
+ }
+ } else {
+ Some("".to_string())
+ };
+
StatusUpdate {
agent_name: self.name.to_owned(),
error: slots_response.error,
@@ -63,6 +75,7 @@ impl MonitoringService {
is_unexpected_response_status: slots_response.is_unexpected_response_status,
slots_idle: slots_response.slots.len() - slots_processing,
slots_processing,
+ model,
}
}
@@ -109,4 +122,4 @@ impl Service for MonitoringService {
fn threads(&self) -> Option {
Some(1)
}
-}
+}
\ No newline at end of file
diff --git a/src/balancer/proxy_service.rs b/src/balancer/proxy_service.rs
index cfab2dfd..9eeb1a54 100644
--- a/src/balancer/proxy_service.rs
+++ b/src/balancer/proxy_service.rs
@@ -6,6 +6,7 @@ use std::time::Duration;
use async_trait::async_trait;
use bytes::Bytes;
use log::error;
+use log::info;
use pingora::http::RequestHeader;
use pingora::proxy::ProxyHttp;
use pingora::proxy::Session;
@@ -41,6 +42,7 @@ pub struct ProxyService {
buffered_request_timeout: Duration,
max_buffered_requests: usize,
rewrite_host_header: bool,
+ check_model: bool,
slots_endpoint_enable: bool,
upstream_peer_pool: Arc,
}
@@ -48,6 +50,7 @@ pub struct ProxyService {
impl ProxyService {
pub fn new(
rewrite_host_header: bool,
+ check_model: bool,
slots_endpoint_enable: bool,
upstream_peer_pool: Arc,
buffered_request_timeout: Duration,
@@ -55,6 +58,7 @@ impl ProxyService {
) -> Self {
Self {
rewrite_host_header,
+ check_model,
slots_endpoint_enable,
upstream_peer_pool,
buffered_request_timeout,
@@ -73,6 +77,7 @@ impl ProxyHttp for ProxyService {
slot_taken: false,
upstream_peer_pool: self.upstream_peer_pool.clone(),
uses_slots: false,
+ requested_model: Some("".to_string()),
}
}
@@ -180,10 +185,108 @@ impl ProxyHttp for ProxyService {
}
"/chat/completions" => true,
"/completion" => true,
+ "/v1/completions" => true,
"/v1/chat/completions" => true,
_ => false,
};
+ info!("upstream_peer - {:?} request | rewrite_host_header? {} check_model? {}", session.req_header().method, self.rewrite_host_header, self.check_model);
+
+ // With model checking enabled on a slot-using request, read the model name from the JSON POST body
+ if self.check_model && ctx.uses_slots {
+ info!("Checking model...");
+ ctx.requested_model = None;
+ if session.req_header().method == "POST" {
+ // Check if the content type is application/json
+ if let Some(content_type) = session.get_header("Content-Type") {
+ if let Ok(content_type_str) = content_type.to_str() {
+ if content_type_str.contains("application/json") {
+ // Enable retry buffering to preserve the request body, reference: https://github.com/cloudflare/pingora/issues/349#issuecomment-2377277028
+ session.enable_retry_buffering();
+ session.read_body_or_idle(false).await.unwrap().unwrap();
+ let request_body = session.get_retry_buffer();
+
+ if let Some(body_bytes) = request_body {
+ match std::str::from_utf8(&body_bytes) {
+ Ok(_) => {
+ // The bytes are valid UTF-8, proceed as normal
+ if let Ok(json_value) = serde_json::from_slice::(&body_bytes) {
+ if let Some(model) = json_value.get("model").and_then(|v| v.as_str()) {
+ ctx.requested_model = Some(model.to_string());
+ info!("Model in request: {:?}", ctx.requested_model);
+ }
+ } else {
+ info!("Failed to parse JSON payload, trying regex extraction");
+ let body_str = String::from_utf8_lossy(&body_bytes).to_string();
+ let re = regex::Regex::new(r#""model"\s*:\s*["']([^"']*)["']"#).unwrap();
+ if let Some(caps) = re.captures(&body_str) {
+ if let Some(model) = caps.get(1) {
+ ctx.requested_model = Some(model.as_str().to_string());
+ info!("Model via regex: {:?}", ctx.requested_model);
+ }
+ } else {
+ info!("Failed to extract model using regex");
+ }
+ }
+ },
+ Err(e) => {
+ // Invalid UTF-8 detected. Truncate to the last valid UTF-8 boundary.
+ let valid_up_to = e.valid_up_to();
+ info!("Invalid UTF-8 detected. Truncating from {} bytes to {} bytes.", body_bytes.len(), valid_up_to);
+
+ // Create a new `Bytes` slice containing only the valid UTF-8 part.
+ let valid_body_bytes = body_bytes.slice(0..valid_up_to);
+
+ // Now proceed with the (truncated) valid_body_bytes
+ if let Ok(json_value) = serde_json::from_slice::(&valid_body_bytes) {
+ if let Some(model) = json_value.get("model").and_then(|v| v.as_str()) {
+ ctx.requested_model = Some(model.to_string());
+ info!("Model in request (after truncation): {:?}", ctx.requested_model);
+ }
+ } else {
+ info!("Failed to parse JSON payload (after truncation), trying regex extraction");
+ let body_str = String::from_utf8_lossy(&valid_body_bytes).to_string();
+ let re = regex::Regex::new(r#""model"\s*:\s*["']([^"']*)["']"#).unwrap();
+ if let Some(caps) = re.captures(&body_str) {
+ if let Some(model) = caps.get(1) {
+ ctx.requested_model = Some(model.as_str().to_string());
+ info!("Model via regex (after truncation): {:?}", ctx.requested_model);
+ }
+ } else {
+ info!("Failed to extract model using regex (after truncation)");
+ }
+ }
+ }
+ }
+ } else {
+ info!("Request body is None");
+ }
+ }
+ }
+ }
+ }
+ // abort if model has not been set
+ if ctx.requested_model == None {
+ info!("Model missing in request");
+ session
+ .respond_error(pingora::http::StatusCode::BAD_REQUEST.as_u16())
+ .await?;
+
+ return Err(Error::new_down(pingora::ErrorType::ConnectRefused));
+ }
+ else if ctx.has_peer_supporting_model() == false {
+ info!("Model {:?} not supported by upstream", ctx.requested_model);
+ session
+ .respond_error(pingora::http::StatusCode::NOT_FOUND.as_u16())
+ .await?;
+
+ return Err(Error::new_down(pingora::ErrorType::ConnectRefused));
+ }
+ else {
+ info!("Model {:?}", ctx.requested_model);
+ }
+ }
+
let peer = tokio::select! {
result = async {
loop {
diff --git a/src/balancer/request_context.rs b/src/balancer/request_context.rs
index eb8b1c11..da308717 100644
--- a/src/balancer/request_context.rs
+++ b/src/balancer/request_context.rs
@@ -2,6 +2,7 @@ use std::sync::Arc;
use anyhow::anyhow;
use log::error;
+use log::info;
use pingora::Error;
use pingora::Result;
@@ -13,6 +14,7 @@ pub struct RequestContext {
pub selected_peer: Option,
pub upstream_peer_pool: Arc,
pub uses_slots: bool,
+ pub requested_model: Option,
}
impl RequestContext {
@@ -30,16 +32,19 @@ impl RequestContext {
}
}
- pub fn use_best_peer_and_take_slot(&mut self) -> anyhow::Result |