diff --git a/Makefile b/Makefile index 939d825..a788364 100644 --- a/Makefile +++ b/Makefile @@ -7,24 +7,30 @@ RUST_DIR := voice API_DIR := studio/api WEB_DIR := studio/web +ifeq (, $(shell command -v uvx 2> /dev/null)) +$(error "uvx could not be found. Please install uv (https://docs.astral.sh/uv/) before proceeding") +endif + +PROTOC := uvx --python 3.12 --from grpcio-tools==1.80.0 python -m grpc_tools.protoc + # ── Protobuf ───────────────────────────────────────────────────── .PHONY: proto proto: ## Generate Python, TS, and ML stubs from proto definitions # ML layer mkdir -p $(ML_DIR)/stt - protoc \ + $(PROTOC) \ --python_out=$(ML_DIR)/stt \ --pyi_out=$(ML_DIR)/stt \ --proto_path=$(PROTO_DIR) \ $(PROTO_DIR)/stt.proto # Studio API layer - protoc \ + $(PROTOC) \ --python_out=$(API_DIR)/app/schemas \ --pyi_out=$(API_DIR)/app/schemas \ --proto_path=$(PROTO_DIR) \ $(PROTO_DIR)/agent.proto # Studio Web layer (TS interfaces) - protoc \ + $(PROTOC) \ --plugin=protoc-gen-ts_proto=$(WEB_DIR)/node_modules/.bin/protoc-gen-ts_proto \ --ts_proto_out=$(WEB_DIR)/src/lib/api \ --ts_proto_opt=esModuleInterop=true,forceLong=string,outputServices=false,outputJsonMethods=false,outputClientImpl=false,outputEncodeMethods=false,outputPartialMethods=false,outputTypeRegistry=false,onlyTypes=true,snakeToCamel=false \ diff --git a/inference/stt/stt_pb2.py b/inference/stt/stt_pb2.py index 3e36eb5..74f0dfb 100644 --- a/inference/stt/stt_pb2.py +++ b/inference/stt/stt_pb2.py @@ -1,12 +1,22 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! +# NO CHECKED-IN PROTOBUF GENCODE # source: stt.proto -# Protobuf Python Version: 4.25.0 +# Protobuf Python Version: 6.31.1 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 6, + 31, + 1, + '', + 'stt.proto' +) # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -19,8 +29,8 @@ _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'stt_pb2', _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None +if not _descriptor._USE_C_DESCRIPTORS: + DESCRIPTOR._loaded_options = None _globals['_STTREQUEST']._serialized_start=18 _globals['_STTREQUEST']._serialized_end=144 _globals['_AUDIODATA']._serialized_start=146 diff --git a/inference/stt/stt_pb2.pyi b/inference/stt/stt_pb2.pyi index 52440ee..a02e8fd 100644 --- a/inference/stt/stt_pb2.pyi +++ b/inference/stt/stt_pb2.pyi @@ -1,6 +1,7 @@ from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message -from typing import ClassVar as _ClassVar, Mapping as _Mapping, Optional as _Optional, Union as _Union +from collections.abc import Mapping as _Mapping +from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union DESCRIPTOR: _descriptor.FileDescriptor diff --git a/proto/agent.proto b/proto/agent.proto index b36fe24..6c31f5b 100644 --- a/proto/agent.proto +++ b/proto/agent.proto @@ -47,6 +47,9 @@ message ToolDef { bool cancel_on_barge_in = 4; // If true, this tool has side effects bool side_effect = 5; + // When true, the runtime passes the result through an LLM summarizer + // before feeding it back. Adds ~5-10s latency. Default false = truncate. + bool summarize_result = 6; } // A single node in the graph @@ -54,7 +57,7 @@ message NodeDef { string system_prompt = 1; repeated string tools = 2; // keys referencing AgentGraphDef.tools repeated string edges = 3; // node IDs this node can transfer to - + optional string model = 4; optional double temperature = 5; optional uint32 max_tokens = 6; @@ -66,23 +69,23 @@ message NodeDef { message AgentGraphDef { // The ID of the node to start with string entry = 1; - + // All nodes keyed by ID map nodes = 2; - + // All tool definitions keyed by tool name map tools = 3; - + // -- Agent-wide settings -- optional string language = 4; // ISO 639-1 optional string timezone = 5; // IANA timezone optional string voice_id = 6; // Default TTS voice ID optional string tts_provider = 7; // e.g. "elevenlabs" optional string tts_model = 8; // e.g. "eleven_turbo_v2" - + // Session recording configuration optional RecordingConfig recording = 9; - + // Envelope field for versions (e.g. "v3_graph") optional string config_schema_version = 10; diff --git a/studio/api/app/agent_builder/edit_ops.py b/studio/api/app/agent_builder/edit_ops.py index 642048a..f4ac007 100644 --- a/studio/api/app/agent_builder/edit_ops.py +++ b/studio/api/app/agent_builder/edit_ops.py @@ -30,7 +30,13 @@ # ── Tool field whitelist ───────────────────────────────────────── -_TOOL_FIELD_ALLOWED = {"description", "params", "script", "side_effect"} +_TOOL_FIELD_ALLOWED = { + "description", + "params", + "script", + "side_effect", + "summarize_result", +} # ── Canonical field ordering ───────────────────────────────────── @@ -46,7 +52,7 @@ _NODE_FIELD_ORDER = ["system_prompt", "greeting", "tools", "edges"] -_TOOL_FIELD_ORDER = ["description", "params", "script", "side_effect"] +_TOOL_FIELD_ORDER = ["description", "params", "script", "side_effect", "summarize_result"] def _validate_string_list(value: Any, field_name: str) -> None: @@ -158,6 +164,10 @@ def _validate_fields(self) -> UpsertTool: self.fields["side_effect"], bool ): raise ValueError("'side_effect' must be a boolean") + if "summarize_result" in self.fields and not isinstance( + self.fields["summarize_result"], bool + ): + raise ValueError("'summarize_result' must be a boolean") return self diff --git a/studio/api/app/api/agents.py b/studio/api/app/api/agents.py index b68ffdc..ef957af 100644 --- a/studio/api/app/api/agents.py +++ b/studio/api/app/api/agents.py @@ -1,6 +1,7 @@ """Agent CRUD API routes.""" import asyncio +import copy import uuid from typing import Any @@ -583,6 +584,9 @@ class AgentConfigPatch(BaseModel): tts_provider: str | None = None tts_model: str | None = None gemini_live_model: str | None = None + # tool_id -> true/false + # null means "unset" (falls back to default truncate behavior) + tool_summarize_overrides: dict[str, bool | None] | None = None regenerate_greeting: bool = False @@ -610,10 +614,15 @@ async def patch_agent_config( if not version: raise HTTPException(status_code=404, detail="Active version not found") - config = dict(version.config_json) - patch = body.model_dump(exclude_unset=True, exclude={"regenerate_greeting"}) + # Deep copy is required: shallow copy can mutate nested JSON objects in-place, + # which may prevent SQLAlchemy from detecting a JSONB change. + config = copy.deepcopy(version.config_json) + patch = body.model_dump( + exclude_unset=True, exclude={"regenerate_greeting", "tool_summarize_overrides"} + ) + tool_summarize_overrides = body.tool_summarize_overrides force_regen = body.regenerate_greeting - if not patch and not force_regen: + if not patch and not force_regen and tool_summarize_overrides is None: raise HTTPException(status_code=400, detail="No fields to update") # ── Language validation ────────────────────────────────────────────── @@ -670,6 +679,28 @@ async def patch_agent_config( for key, value in patch.items(): config[key] = value + # ── Per-tool summarize_result patch ─────────────────────────────────────── + if tool_summarize_overrides is not None: + tools_obj = config.get("tools") + if not isinstance(tools_obj, dict): + raise HTTPException( + status_code=422, + detail="Invalid config: top-level 'tools' must be an object", + ) + + for tool_id, summarize in tool_summarize_overrides.items(): + tool_def = tools_obj.get(tool_id) + if not isinstance(tool_def, dict): + raise HTTPException( + status_code=422, + detail=f"Unknown tool '{tool_id}' in tool_summarize_overrides", + ) + + if summarize is None: + tool_def.pop("summarize_result", None) + else: + tool_def["summarize_result"] = bool(summarize) + # ── Greeting regeneration ────────────────────────────────────────── greeting_updated = False new_greeting: str | None = None @@ -704,6 +735,7 @@ async def patch_agent_config( db, agent_id=agent.id, patch=patch, + tool_summarize_overrides=tool_summarize_overrides, greeting_updated=greeting_updated, new_greeting=new_greeting, ) @@ -733,7 +765,8 @@ async def patch_agent_config( async def _inject_config_change_event( db: AsyncSession, agent_id: uuid.UUID, - patch: dict[str, str], + patch: dict[str, Any], + tool_summarize_overrides: dict[str, bool | None] | None = None, greeting_updated: bool = False, new_greeting: str | None = None, ) -> None: @@ -765,6 +798,13 @@ async def _inject_config_change_event( else "Standard Pipeline" ) changes.append(f"conversation mode set to {mode}") + if tool_summarize_overrides: + for tool_id, summarize in tool_summarize_overrides.items(): + if summarize is None: + label = "auto" + else: + label = "enabled" if summarize else "disabled" + changes.append(f"{tool_id} AI summarization set to {label}") if not changes: return diff --git a/studio/api/app/schemas/agent_pb2.py b/studio/api/app/schemas/agent_pb2.py index 3dad1f1..5cfd654 100644 --- a/studio/api/app/schemas/agent_pb2.py +++ b/studio/api/app/schemas/agent_pb2.py @@ -1,12 +1,22 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! +# NO CHECKED-IN PROTOBUF GENCODE # source: agent.proto -# Protobuf Python Version: 4.25.0 +# Protobuf Python Version: 6.31.1 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version from google.protobuf import symbol_database as _symbol_database from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 6, + 31, + 1, + '', + 'agent.proto' +) # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -14,34 +24,34 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0b\x61gent.proto\x12\x05\x61gent\"\x8f\x02\n\x0fRecordingConfig\x12\x0f\n\x07\x65nabled\x18\x01 \x01(\x08\x12\x12\n\noutput_uri\x18\x02 \x01(\t\x12(\n\x0c\x61udio_layout\x18\x03 \x01(\x0e\x32\x12.agent.AudioLayout\x12\x13\n\x0bsample_rate\x18\x04 \x01(\r\x12(\n\x0c\x61udio_format\x18\x05 \x01(\x0e\x32\x12.agent.AudioFormat\x12\x19\n\x11max_duration_secs\x18\x06 \x01(\r\x12\x17\n\x0fsave_transcript\x18\x07 \x01(\x08\x12\x1c\n\x14include_tool_details\x18\x08 \x01(\x08\x12\x1c\n\x14include_llm_metadata\x18\t \x01(\x08\"^\n\x08ParamDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x03 \x01(\t\x12\x10\n\x08required\x18\x04 \x01(\x08\x12\x0f\n\x07options\x18\x05 \x03(\t\"\x80\x01\n\x07ToolDef\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12\x0e\n\x06script\x18\x02 \x01(\t\x12\x1f\n\x06params\x18\x03 \x03(\x0b\x32\x0f.agent.ParamDef\x12\x1a\n\x12\x63\x61ncel_on_barge_in\x18\x04 \x01(\x08\x12\x13\n\x0bside_effect\x18\x05 \x01(\x08\"\xf6\x01\n\x07NodeDef\x12\x15\n\rsystem_prompt\x18\x01 \x01(\t\x12\r\n\x05tools\x18\x02 \x03(\t\x12\r\n\x05\x65\x64ges\x18\x03 \x03(\t\x12\x12\n\x05model\x18\x04 \x01(\tH\x00\x88\x01\x01\x12\x18\n\x0btemperature\x18\x05 \x01(\x01H\x01\x88\x01\x01\x12\x17\n\nmax_tokens\x18\x06 \x01(\rH\x02\x88\x01\x01\x12\x15\n\x08voice_id\x18\x07 \x01(\tH\x03\x88\x01\x01\x12\x15\n\x08greeting\x18\x08 \x01(\tH\x04\x88\x01\x01\x42\x08\n\x06_modelB\x0e\n\x0c_temperatureB\r\n\x0b_max_tokensB\x0b\n\t_voice_idB\x0b\n\t_greeting\"\xb4\x04\n\rAgentGraphDef\x12\r\n\x05\x65ntry\x18\x01 \x01(\t\x12.\n\x05nodes\x18\x02 \x03(\x0b\x32\x1f.agent.AgentGraphDef.NodesEntry\x12.\n\x05tools\x18\x03 \x03(\x0b\x32\x1f.agent.AgentGraphDef.ToolsEntry\x12\x15\n\x08language\x18\x04 \x01(\tH\x00\x88\x01\x01\x12\x15\n\x08timezone\x18\x05 \x01(\tH\x01\x88\x01\x01\x12\x15\n\x08voice_id\x18\x06 \x01(\tH\x02\x88\x01\x01\x12\x19\n\x0ctts_provider\x18\x07 \x01(\tH\x03\x88\x01\x01\x12\x16\n\ttts_model\x18\x08 \x01(\tH\x04\x88\x01\x01\x12.\n\trecording\x18\t \x01(\x0b\x32\x16.agent.RecordingConfigH\x05\x88\x01\x01\x12\"\n\x15\x63onfig_schema_version\x18\n \x01(\tH\x06\x88\x01\x01\x1a<\n\nNodesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x1d\n\x05value\x18\x02 \x01(\x0b\x32\x0e.agent.NodeDef:\x02\x38\x01\x1a<\n\nToolsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x1d\n\x05value\x18\x02 \x01(\x0b\x32\x0e.agent.ToolDef:\x02\x38\x01\x42\x0b\n\t_languageB\x0b\n\t_timezoneB\x0b\n\t_voice_idB\x0f\n\r_tts_providerB\x0c\n\n_tts_modelB\x0c\n\n_recordingB\x18\n\x16_config_schema_version*[\n\x0b\x41udioLayout\x12\x1c\n\x18\x41UDIO_LAYOUT_UNSPECIFIED\x10\x00\x12\x17\n\x13\x41UDIO_LAYOUT_STEREO\x10\x01\x12\x15\n\x11\x41UDIO_LAYOUT_MONO\x10\x02*X\n\x0b\x41udioFormat\x12\x1c\n\x18\x41UDIO_FORMAT_UNSPECIFIED\x10\x00\x12\x15\n\x11\x41UDIO_FORMAT_OPUS\x10\x01\x12\x14\n\x10\x41UDIO_FORMAT_WAV\x10\x02\x42\x30Z.github.com/prime8ai/voice-agent-os/proto/agentb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0b\x61gent.proto\x12\x05\x61gent\"\x8f\x02\n\x0fRecordingConfig\x12\x0f\n\x07\x65nabled\x18\x01 \x01(\x08\x12\x12\n\noutput_uri\x18\x02 \x01(\t\x12(\n\x0c\x61udio_layout\x18\x03 \x01(\x0e\x32\x12.agent.AudioLayout\x12\x13\n\x0bsample_rate\x18\x04 \x01(\r\x12(\n\x0c\x61udio_format\x18\x05 \x01(\x0e\x32\x12.agent.AudioFormat\x12\x19\n\x11max_duration_secs\x18\x06 \x01(\r\x12\x17\n\x0fsave_transcript\x18\x07 \x01(\x08\x12\x1c\n\x14include_tool_details\x18\x08 \x01(\x08\x12\x1c\n\x14include_llm_metadata\x18\t \x01(\x08\"^\n\x08ParamDef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x03 \x01(\t\x12\x10\n\x08required\x18\x04 \x01(\x08\x12\x0f\n\x07options\x18\x05 \x03(\t\"\x9a\x01\n\x07ToolDef\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12\x0e\n\x06script\x18\x02 \x01(\t\x12\x1f\n\x06params\x18\x03 \x03(\x0b\x32\x0f.agent.ParamDef\x12\x1a\n\x12\x63\x61ncel_on_barge_in\x18\x04 \x01(\x08\x12\x13\n\x0bside_effect\x18\x05 \x01(\x08\x12\x18\n\x10summarize_result\x18\x06 \x01(\x08\"\xf6\x01\n\x07NodeDef\x12\x15\n\rsystem_prompt\x18\x01 \x01(\t\x12\r\n\x05tools\x18\x02 \x03(\t\x12\r\n\x05\x65\x64ges\x18\x03 \x03(\t\x12\x12\n\x05model\x18\x04 \x01(\tH\x00\x88\x01\x01\x12\x18\n\x0btemperature\x18\x05 \x01(\x01H\x01\x88\x01\x01\x12\x17\n\nmax_tokens\x18\x06 \x01(\rH\x02\x88\x01\x01\x12\x15\n\x08voice_id\x18\x07 \x01(\tH\x03\x88\x01\x01\x12\x15\n\x08greeting\x18\x08 \x01(\tH\x04\x88\x01\x01\x42\x08\n\x06_modelB\x0e\n\x0c_temperatureB\r\n\x0b_max_tokensB\x0b\n\t_voice_idB\x0b\n\t_greeting\"\xea\x04\n\rAgentGraphDef\x12\r\n\x05\x65ntry\x18\x01 \x01(\t\x12.\n\x05nodes\x18\x02 \x03(\x0b\x32\x1f.agent.AgentGraphDef.NodesEntry\x12.\n\x05tools\x18\x03 \x03(\x0b\x32\x1f.agent.AgentGraphDef.ToolsEntry\x12\x15\n\x08language\x18\x04 \x01(\tH\x00\x88\x01\x01\x12\x15\n\x08timezone\x18\x05 \x01(\tH\x01\x88\x01\x01\x12\x15\n\x08voice_id\x18\x06 \x01(\tH\x02\x88\x01\x01\x12\x19\n\x0ctts_provider\x18\x07 \x01(\tH\x03\x88\x01\x01\x12\x16\n\ttts_model\x18\x08 \x01(\tH\x04\x88\x01\x01\x12.\n\trecording\x18\t \x01(\x0b\x32\x16.agent.RecordingConfigH\x05\x88\x01\x01\x12\"\n\x15\x63onfig_schema_version\x18\n \x01(\tH\x06\x88\x01\x01\x12\x1e\n\x11gemini_live_model\x18\x0c \x01(\tH\x07\x88\x01\x01\x1a<\n\nNodesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x1d\n\x05value\x18\x02 \x01(\x0b\x32\x0e.agent.NodeDef:\x02\x38\x01\x1a<\n\nToolsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x1d\n\x05value\x18\x02 \x01(\x0b\x32\x0e.agent.ToolDef:\x02\x38\x01\x42\x0b\n\t_languageB\x0b\n\t_timezoneB\x0b\n\t_voice_idB\x0f\n\r_tts_providerB\x0c\n\n_tts_modelB\x0c\n\n_recordingB\x18\n\x16_config_schema_versionB\x14\n\x12_gemini_live_model*[\n\x0b\x41udioLayout\x12\x1c\n\x18\x41UDIO_LAYOUT_UNSPECIFIED\x10\x00\x12\x17\n\x13\x41UDIO_LAYOUT_STEREO\x10\x01\x12\x15\n\x11\x41UDIO_LAYOUT_MONO\x10\x02*X\n\x0b\x41udioFormat\x12\x1c\n\x18\x41UDIO_FORMAT_UNSPECIFIED\x10\x00\x12\x15\n\x11\x41UDIO_FORMAT_OPUS\x10\x01\x12\x14\n\x10\x41UDIO_FORMAT_WAV\x10\x02\x42\x30Z.github.com/prime8ai/voice-agent-os/proto/agentb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'agent_pb2', _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - _globals['DESCRIPTOR']._options = None +if not _descriptor._USE_C_DESCRIPTORS: + _globals['DESCRIPTOR']._loaded_options = None _globals['DESCRIPTOR']._serialized_options = b'Z.github.com/prime8ai/voice-agent-os/proto/agent' - _globals['_AGENTGRAPHDEF_NODESENTRY']._options = None + _globals['_AGENTGRAPHDEF_NODESENTRY']._loaded_options = None _globals['_AGENTGRAPHDEF_NODESENTRY']._serialized_options = b'8\001' - _globals['_AGENTGRAPHDEF_TOOLSENTRY']._options = None + _globals['_AGENTGRAPHDEF_TOOLSENTRY']._loaded_options = None _globals['_AGENTGRAPHDEF_TOOLSENTRY']._serialized_options = b'8\001' - _globals['_AUDIOLAYOUT']._serialized_start=1339 - _globals['_AUDIOLAYOUT']._serialized_end=1430 - _globals['_AUDIOFORMAT']._serialized_start=1432 - _globals['_AUDIOFORMAT']._serialized_end=1520 + _globals['_AUDIOLAYOUT']._serialized_start=1419 + _globals['_AUDIOLAYOUT']._serialized_end=1510 + _globals['_AUDIOFORMAT']._serialized_start=1512 + _globals['_AUDIOFORMAT']._serialized_end=1600 _globals['_RECORDINGCONFIG']._serialized_start=23 _globals['_RECORDINGCONFIG']._serialized_end=294 _globals['_PARAMDEF']._serialized_start=296 _globals['_PARAMDEF']._serialized_end=390 _globals['_TOOLDEF']._serialized_start=393 - _globals['_TOOLDEF']._serialized_end=521 - _globals['_NODEDEF']._serialized_start=524 - _globals['_NODEDEF']._serialized_end=770 - _globals['_AGENTGRAPHDEF']._serialized_start=773 - _globals['_AGENTGRAPHDEF']._serialized_end=1337 - _globals['_AGENTGRAPHDEF_NODESENTRY']._serialized_start=1105 - _globals['_AGENTGRAPHDEF_NODESENTRY']._serialized_end=1165 - _globals['_AGENTGRAPHDEF_TOOLSENTRY']._serialized_start=1167 - _globals['_AGENTGRAPHDEF_TOOLSENTRY']._serialized_end=1227 + _globals['_TOOLDEF']._serialized_end=547 + _globals['_NODEDEF']._serialized_start=550 + _globals['_NODEDEF']._serialized_end=796 + _globals['_AGENTGRAPHDEF']._serialized_start=799 + _globals['_AGENTGRAPHDEF']._serialized_end=1417 + _globals['_AGENTGRAPHDEF_NODESENTRY']._serialized_start=1163 + _globals['_AGENTGRAPHDEF_NODESENTRY']._serialized_end=1223 + _globals['_AGENTGRAPHDEF_TOOLSENTRY']._serialized_start=1225 + _globals['_AGENTGRAPHDEF_TOOLSENTRY']._serialized_end=1285 # @@protoc_insertion_point(module_scope) diff --git a/studio/api/app/schemas/agent_pb2.pyi b/studio/api/app/schemas/agent_pb2.pyi index 89cd077..ce5ff37 100644 --- a/studio/api/app/schemas/agent_pb2.pyi +++ b/studio/api/app/schemas/agent_pb2.pyi @@ -2,7 +2,8 @@ from google.protobuf.internal import containers as _containers from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message -from typing import ClassVar as _ClassVar, Iterable as _Iterable, Mapping as _Mapping, Optional as _Optional, Union as _Union +from collections.abc import Iterable as _Iterable, Mapping as _Mapping +from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union DESCRIPTOR: _descriptor.FileDescriptor @@ -61,18 +62,20 @@ class ParamDef(_message.Message): def __init__(self, name: _Optional[str] = ..., type: _Optional[str] = ..., description: _Optional[str] = ..., required: bool = ..., options: _Optional[_Iterable[str]] = ...) -> None: ... class ToolDef(_message.Message): - __slots__ = ("description", "script", "params", "cancel_on_barge_in", "side_effect") + __slots__ = ("description", "script", "params", "cancel_on_barge_in", "side_effect", "summarize_result") DESCRIPTION_FIELD_NUMBER: _ClassVar[int] SCRIPT_FIELD_NUMBER: _ClassVar[int] PARAMS_FIELD_NUMBER: _ClassVar[int] CANCEL_ON_BARGE_IN_FIELD_NUMBER: _ClassVar[int] SIDE_EFFECT_FIELD_NUMBER: _ClassVar[int] + SUMMARIZE_RESULT_FIELD_NUMBER: _ClassVar[int] description: str script: str params: _containers.RepeatedCompositeFieldContainer[ParamDef] cancel_on_barge_in: bool side_effect: bool - def __init__(self, description: _Optional[str] = ..., script: _Optional[str] = ..., params: _Optional[_Iterable[_Union[ParamDef, _Mapping]]] = ..., cancel_on_barge_in: bool = ..., side_effect: bool = ...) -> None: ... + summarize_result: bool + def __init__(self, description: _Optional[str] = ..., script: _Optional[str] = ..., params: _Optional[_Iterable[_Union[ParamDef, _Mapping]]] = ..., cancel_on_barge_in: bool = ..., side_effect: bool = ..., summarize_result: bool = ...) -> None: ... class NodeDef(_message.Message): __slots__ = ("system_prompt", "tools", "edges", "model", "temperature", "max_tokens", "voice_id", "greeting") @@ -95,7 +98,7 @@ class NodeDef(_message.Message): def __init__(self, system_prompt: _Optional[str] = ..., tools: _Optional[_Iterable[str]] = ..., edges: _Optional[_Iterable[str]] = ..., model: _Optional[str] = ..., temperature: _Optional[float] = ..., max_tokens: _Optional[int] = ..., voice_id: _Optional[str] = ..., greeting: _Optional[str] = ...) -> None: ... class AgentGraphDef(_message.Message): - __slots__ = ("entry", "nodes", "tools", "language", "timezone", "voice_id", "tts_provider", "tts_model", "recording", "config_schema_version") + __slots__ = ("entry", "nodes", "tools", "language", "timezone", "voice_id", "tts_provider", "tts_model", "recording", "config_schema_version", "gemini_live_model") class NodesEntry(_message.Message): __slots__ = ("key", "value") KEY_FIELD_NUMBER: _ClassVar[int] @@ -120,6 +123,7 @@ class AgentGraphDef(_message.Message): TTS_MODEL_FIELD_NUMBER: _ClassVar[int] RECORDING_FIELD_NUMBER: _ClassVar[int] CONFIG_SCHEMA_VERSION_FIELD_NUMBER: _ClassVar[int] + GEMINI_LIVE_MODEL_FIELD_NUMBER: _ClassVar[int] entry: str nodes: _containers.MessageMap[str, NodeDef] tools: _containers.MessageMap[str, ToolDef] @@ -130,4 +134,5 @@ class AgentGraphDef(_message.Message): tts_model: str recording: RecordingConfig config_schema_version: str - def __init__(self, entry: _Optional[str] = ..., nodes: _Optional[_Mapping[str, NodeDef]] = ..., tools: _Optional[_Mapping[str, ToolDef]] = ..., language: _Optional[str] = ..., timezone: _Optional[str] = ..., voice_id: _Optional[str] = ..., tts_provider: _Optional[str] = ..., tts_model: _Optional[str] = ..., recording: _Optional[_Union[RecordingConfig, _Mapping]] = ..., config_schema_version: _Optional[str] = ...) -> None: ... + gemini_live_model: str + def __init__(self, entry: _Optional[str] = ..., nodes: _Optional[_Mapping[str, NodeDef]] = ..., tools: _Optional[_Mapping[str, ToolDef]] = ..., language: _Optional[str] = ..., timezone: _Optional[str] = ..., voice_id: _Optional[str] = ..., tts_provider: _Optional[str] = ..., tts_model: _Optional[str] = ..., recording: _Optional[_Union[RecordingConfig, _Mapping]] = ..., config_schema_version: _Optional[str] = ..., gemini_live_model: _Optional[str] = ...) -> None: ... diff --git a/studio/web/public/schemas/agent-config-v1.schema.json b/studio/web/public/schemas/agent-config-v1.schema.json index abfc4c0..a6b50ce 100644 --- a/studio/web/public/schemas/agent-config-v1.schema.json +++ b/studio/web/public/schemas/agent-config-v1.schema.json @@ -25,7 +25,92 @@ "config": { "type": "object", "description": "Agent runtime config (v3_graph).", - "additionalProperties": true + "additionalProperties": true, + "properties": { + "entry": { + "type": "string", + "minLength": 1 + }, + "nodes": { + "type": "object", + "additionalProperties": { + "type": "object", + "additionalProperties": true, + "properties": { + "system_prompt": { + "type": "string" + }, + "greeting": { + "type": "string" + }, + "tools": { + "type": "array", + "items": { + "type": "string" + } + }, + "edges": { + "type": "array", + "items": { + "type": "string" + } + } + } + } + }, + "tools": { + "type": "object", + "additionalProperties": { + "type": "object", + "additionalProperties": true, + "properties": { + "description": { + "type": "string" + }, + "script": { + "type": "string" + }, + "params": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true, + "properties": { + "name": { + "type": "string" + }, + "type": { + "type": "string" + }, + "description": { + "type": "string" + }, + "required": { + "type": "boolean" + }, + "options": { + "type": "array", + "items": { + "type": "string" + } + } + } + } + }, + "cancel_on_barge_in": { + "type": "boolean" + }, + "side_effect": { + "type": "boolean" + }, + "summarize_result": { + "type": "boolean", + "description": "When true, long tool output is summarized by AI instead of being truncated." + } + } + } + } + } }, "mermaid_diagram": { "type": ["string", "null"] diff --git a/studio/web/src/components/agent/agent-config-editor.tsx b/studio/web/src/components/agent/agent-config-editor.tsx index 6793c95..e7c7b26 100644 --- a/studio/web/src/components/agent/agent-config-editor.tsx +++ b/studio/web/src/components/agent/agent-config-editor.tsx @@ -36,6 +36,7 @@ import { SelectValue, } from "@/components/ui/select"; import { Spinner } from "@/components/ui/spinner"; +import { Switch } from "@/components/ui/switch"; import { cn } from "@/lib/utils"; import ReactMarkdown from "react-markdown"; import remarkGfm from "remark-gfm"; @@ -43,6 +44,7 @@ import { Dialog, DialogContent, DialogTrigger } from "@/components/ui/dialog"; import ConfigViewer from "@/components/agent/config-viewer"; import ConfigDiff from "@/components/agent/config-diff"; import ShikiCodeBlock from "@/components/ui/shiki-code-block"; +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"; import { toast } from "sonner"; // ── Language & timezone options ────────────────────────────────── @@ -180,6 +182,7 @@ interface ConfigTool { description?: string; side_effect?: boolean; script?: string; + summarize_result?: boolean; } interface FullConfig extends Record { @@ -288,9 +291,10 @@ export default function AgentConfigEditor({ }, [fields?.language, fields?.timezone, fields?.voice_id, fields?.gemini_live_model]); const patchField = useCallback( - async (payload: Record) => { + async (payload: Record) => { const primaryField = Object.keys(payload).find((k) => k !== "regenerate_greeting"); - if (!primaryField && !payload.regenerate_greeting) return; + const shouldRegen = Boolean(payload["regenerate_greeting"]); + if (!primaryField && !shouldRegen) return; const trackField = primaryField ?? "language"; const showGlobalSyncToast = !["voice_id", "language", "timezone"].includes(trackField); @@ -880,46 +884,83 @@ export default function AgentConfigEditor({ {tools.length > 0 ? ( -
+
{tools.map(([id, tool]) => (
-
-
-
- -
-
- - {id} +
+
+ +
+
+ + {id} + +
+ + {tool.side_effect ? "Write" : "Read"} -
- - {tool.side_effect ? "Write" : "Read"} - -
-
+
+
-
- - Audit -
+
+ + + +
e.stopPropagation()} + onPointerDown={(e) => e.stopPropagation()} + > + + Summarize Result + + + patchField({ + tool_summarize_overrides: { + [id]: checked || null, + }, + }) + } + /> +
+
+ + Use LLM to summarize long tool outputs + +
+ + + +
+ +
+
+ + View Tool Implementation + +
+
-

- {tool.description || "—"} + {/* Row 3: description */} +

+ {tool.description || "No description provided."}

diff --git a/studio/web/src/lib/api/agent.ts b/studio/web/src/lib/api/agent.ts index f569a2b..c51443c 100644 --- a/studio/web/src/lib/api/agent.ts +++ b/studio/web/src/lib/api/agent.ts @@ -1,9 +1,11 @@ // Code generated by protoc-gen-ts_proto. DO NOT EDIT. // versions: // protoc-gen-ts_proto v2.11.6 -// protoc v4.25.0 +// protoc v6.31.1 // source: agent.proto +/* eslint-disable */ + export const protobufPackage = "agent"; export enum AudioLayout { @@ -53,6 +55,11 @@ export interface ToolDef { cancel_on_barge_in: boolean; /** If true, this tool has side effects */ side_effect: boolean; + /** + * When true, the runtime passes the result through an LLM summarizer + * before feeding it back. Adds ~5-10s latency. Default false = truncate. + */ + summarize_result: boolean; } /** A single node in the graph */ @@ -64,7 +71,9 @@ export interface NodeDef { edges: string[]; model?: string | undefined; temperature?: number | undefined; - max_tokens?: number | undefined; + max_tokens?: + | number + | undefined; /** TTS voice override */ voice_id?: string | undefined; greeting?: string | undefined; @@ -79,20 +88,38 @@ export interface AgentGraphDef { /** All tool definitions keyed by tool name */ tools: { [key: string]: ToolDef }; /** -- Agent-wide settings -- */ - language?: string | undefined; + language?: + | string + | undefined; /** IANA timezone */ - timezone?: string | undefined; + timezone?: + | string + | undefined; /** Default TTS voice ID */ - voice_id?: string | undefined; + voice_id?: + | string + | undefined; /** e.g. "elevenlabs" */ - tts_provider?: string | undefined; + tts_provider?: + | string + | undefined; /** e.g. "eleven_turbo_v2" */ - tts_model?: string | undefined; + tts_model?: + | string + | undefined; /** Session recording configuration */ - recording?: RecordingConfig | undefined; + recording?: + | RecordingConfig + | undefined; /** Envelope field for versions (e.g. "v3_graph") */ - config_schema_version?: string | undefined; - gemini_live_api_key?: string | undefined; + config_schema_version?: + | string + | undefined; + /** + * -- Native multimodal (Gemini Live) -- + * When set, the session bypasses STT/LLM/TTS and uses Gemini Live's native + * bidirectional audio-to-audio WebSocket for the entire conversation. + */ gemini_live_model?: string | undefined; } diff --git a/studio/web/src/lib/api/client.ts b/studio/web/src/lib/api/client.ts index 1031ea2..411cbe6 100644 --- a/studio/web/src/lib/api/client.ts +++ b/studio/web/src/lib/api/client.ts @@ -593,6 +593,7 @@ export const api = { voice_id?: string; tts_provider?: string; tts_model?: string; + tool_summarize_overrides?: Record; regenerate_greeting?: boolean; } ) => diff --git a/voice/engine/crates/agent-kit/src/agent_backends/default.rs b/voice/engine/crates/agent-kit/src/agent_backends/default.rs index 7316cab..7cea969 100644 --- a/voice/engine/crates/agent-kit/src/agent_backends/default.rs +++ b/voice/engine/crates/agent-kit/src/agent_backends/default.rs @@ -13,17 +13,19 @@ use tokio::sync::mpsc; use tracing::{info, warn}; use uuid::Uuid; +use crate::agent_backends::ChatMessage; use crate::agent_backends::{AgentBackend, AgentBackendConfig, AgentEvent, ToolInterceptor}; use crate::context_summarizer::{trim_history, ContextSummarizationConfig, ContextSummarizer}; use crate::micro_tasks; use crate::providers::{LlmCallConfig, LlmProvider, LlmProviderError}; +use crate::providers::{LlmEvent as InnerLlmEvent, ToolCallEvent}; use crate::swarm::{ build_node_tool_schemas, make_artifact_tool_schemas, make_hang_up_tool_schema, make_on_hold_tool_schema, AgentGraphDef, SwarmState, HANG_UP_TOOL_NAME, ON_HOLD_TOOL_NAME, }; -use crate::tool_executor::{spawn_tool_task, ToolTaskResult}; -use crate::agent_backends::ChatMessage; -use crate::providers::{LlmEvent as InnerLlmEvent, ToolCallEvent}; +use crate::tool_executor::{ + spawn_tool_task, ToolTaskResult, +}; use crate::ScriptEngine; // ── Runtime system prompt suffix ──────────────────────────────── @@ -219,9 +221,6 @@ pub struct DefaultAgentBackend { /// Optional interceptor for intercepting tool calls (testing, observability). interceptor: Option>, - /// Optional async summarizer for tool results before feeding to LLM. - tool_result_transformer: Option, - // ── Context summarization ── /// Optional context summarizer for background conversation compression. context_summarizer: Option, @@ -305,16 +304,6 @@ impl DefaultAgentBackend { // Flags are set by the calling binary (voice-engine reads env vars via // envy and populates AgentBackendConfig directly). - let tool_transformer: Option = if config.tool_summarizer - { - Some(micro_tasks::ToolResultSummarizer::new( - Arc::clone(&provider), - 500, // min chars before summarization kicks in - )) - } else { - None - }; - let ctx_summarizer: Option = if config.context_summarizer { Some(ContextSummarizer::new(Arc::clone(&provider))) } else { @@ -325,7 +314,6 @@ impl DefaultAgentBackend { provider, script_engine, interceptor: None, - tool_result_transformer: tool_transformer, context_summarizer: ctx_summarizer, context_summarization_config: ContextSummarizationConfig::default(), filler_task: None, @@ -566,7 +554,25 @@ impl DefaultAgentBackend { /// If a `ToolInterceptor` is set, it is consulted before and after execution: /// - `before_tool_call` can return `Stub(result)` to skip execution entirely. /// - `after_tool_call` can return `Override(result)` to replace the real result. - fn spawn_tool(&mut self, call_id: String, name: String, args: String, side_effect: bool) { + fn spawn_tool( + &mut self, + call_id: String, + name: String, + args: String, + side_effect: bool, + summarize: bool, + ) { + // Some streaming providers can emit duplicate tool-call events for the same + // call_id (e.g. retry/delta edge cases). Guard against double-counting, which + // would leave `tools_remaining` stuck > 0 forever. + if self.pending_tool_info.contains_key(&call_id) { + warn!( + "[agent_backend] duplicate tool call id ignored: {} ({})", + call_id, name + ); + return; + } + // Spawn filler generator for side-effecting tools only when enabled. // We ensure only one filler task runs per wait-batch by checking `.is_none()`. if side_effect && self.config.tool_filler && self.filler_task.is_none() { @@ -589,16 +595,27 @@ impl DefaultAgentBackend { })); } + let before = self.tools_remaining; self.pending_tool_info.insert(call_id.clone(), name.clone()); self.tools_remaining += 1; + info!( + "[agent_backend] spawn_tool: id={} name={} tools_remaining {}->{} pending_info={}", + call_id, + name, + before, + self.tools_remaining, + self.pending_tool_info.len() + ); spawn_tool_task( call_id, name, args, side_effect, + summarize, self.script_engine.clone(), self.interceptor.clone(), + Some(Arc::clone(&self.provider)), self.tool_result_tx.clone(), ); } @@ -621,8 +638,14 @@ impl DefaultAgentBackend { .and_then(|v| v.get("reason").and_then(|r| r.as_str()).map(String::from)) .unwrap_or_else(|| "agent_initiated".to_string()); - info!("[agent_backend] hang_up deferred (tools_remaining={}): {}", self.tools_remaining, reason); - self.pending_hang_up = Some(PendingHangUp { reason, content: None }); + info!( + "[agent_backend] hang_up deferred (tools_remaining={}): {}", + self.tools_remaining, reason + ); + self.pending_hang_up = Some(PendingHangUp { + reason, + content: None, + }); // Do NOT touch llm_event_rx, pending_tokens, or phase here. // The stream continues; hang_up is resolved at stream-end. } @@ -715,12 +738,12 @@ impl DefaultAgentBackend { ..tc.clone() }); - let side_effect = self + let (side_effect, summarize) = self .swarm .as_ref() .and_then(|s| s.graph.tools.get(&tc.name)) - .map(|t| t.side_effect) - .unwrap_or(false); + .map(|t| (t.side_effect, t.summarize_result)) + .unwrap_or((false, false)); if side_effect { tracing::debug!("[agent_backend] Tool '{}' marked as side-effect", tc.name); @@ -733,6 +756,7 @@ impl DefaultAgentBackend { tc.name.clone(), tc.arguments.clone(), side_effect, + summarize, ); return Some(AgentEvent::ToolCallStarted { @@ -763,8 +787,13 @@ impl DefaultAgentBackend { self.phase = Phase::Idle; // Pending hang_up takes priority over Finished. if let Some(ph) = self.pending_hang_up.take() { - info!("[agent_backend] hang_up resolved at stream-end (no pending tools)"); - return Some(AgentEvent::HangUp { reason: ph.reason, content: ph.content }); + info!( + "[agent_backend] hang_up resolved at stream-end (no pending tools)" + ); + return Some(AgentEvent::HangUp { + reason: ph.reason, + content: ph.content, + }); } // Normal turn completion. if let Some(ctx) = self.context_summarizer.as_mut() { @@ -784,6 +813,10 @@ impl DefaultAgentBackend { } async fn handle_waiting_for_tools_phase(&mut self) -> Option { + tracing::debug!( + "[agent_backend] WaitingForTools: pending={} waiting for next tool result", + self.tools_remaining + ); let rx = &mut self.tool_result_rx; let result = if let Some(mut filler_task) = self.filler_task.take() { tokio::select! { @@ -806,15 +839,26 @@ impl DefaultAgentBackend { rx.recv().await? }; - // Remove from pending info - self.pending_tool_info.remove(&result.call_id); + info!( + "[agent_backend] WaitingForTools: received tool result id={} name={} success={}", + result.call_id, result.name, result.success + ); - // Apply tool-result summarization if enabled - let content = if let Some(ref transformer) = self.tool_result_transformer { - transformer.transform(&result.name, &result.result).await - } else { - result.result - }; + // Remove from pending info (best-effort). + let removed = self.pending_tool_info.remove(&result.call_id).is_some(); + if !removed { + warn!( + "[agent_backend] received tool result for unknown call_id={} (name={})", + result.call_id, result.name + ); + } + + // IMPORTANT: + // Do not await additional async work (e.g. tool summarizer) here. + // This phase runs inside a recv loop that can be cancelled/repolled by + // the reactor select loop; keeping this path await-free ensures + // `tools_remaining` accounting is atomic once a result is received. + let content = result.result.clone(); let error_msg = (!result.success).then(|| content.clone()); @@ -826,7 +870,23 @@ impl DefaultAgentBackend { tool_call_id: Some(result.call_id.clone()), }); - self.tools_remaining -= 1; + let before = self.tools_remaining; + if self.tools_remaining == 0 { + warn!( + "[agent_backend] tools_remaining already zero when result arrived: id={} name={}", + result.call_id, result.name + ); + } else { + self.tools_remaining -= 1; + } + info!( + "[agent_backend] tool_result_accounting: id={} removed={} tools_remaining {}->{} pending_info={}", + result.call_id, + removed, + before, + self.tools_remaining, + self.pending_tool_info.len() + ); let event = AgentEvent::ToolCallCompleted { id: result.call_id, @@ -888,7 +948,10 @@ impl AgentBackend for DefaultAgentBackend { let tz = self.timezone(); self.conversation = vec![ChatMessage { role: "system".to_string(), - content: Some(serde_json::Value::String(with_suffix(&prompt, tz.as_deref()))), + content: Some(serde_json::Value::String(with_suffix( + &prompt, + tz.as_deref(), + ))), tool_calls: None, tool_call_id: None, }]; @@ -914,6 +977,11 @@ impl AgentBackend for DefaultAgentBackend { } async fn start_turn(&mut self) -> Result<(), LlmProviderError> { + info!( + "[agent_backend] start_turn: reset counters (prev tools_remaining={} pending_info={})", + self.tools_remaining, + self.pending_tool_info.len() + ); self.tool_rounds = 0; self.tools_remaining = 0; self.pending_tool_info.clear(); @@ -971,7 +1039,9 @@ impl AgentBackend for DefaultAgentBackend { for (call_id, _name) in std::mem::take(&mut self.pending_tool_info) { self.conversation.push(ChatMessage { role: "tool".to_string(), - content: Some(serde_json::Value::String("Tool execution was interrupted by the user.".to_string())), + content: Some(serde_json::Value::String( + "Tool execution was interrupted by the user.".to_string(), + )), tool_calls: None, tool_call_id: Some(call_id), }); @@ -1004,7 +1074,10 @@ impl AgentBackend for DefaultAgentBackend { if let Some(first_msg) = self.conversation.first_mut() { if first_msg.role == "system" { let tz = swarm.graph.timezone.as_deref(); - first_msg.content = Some(serde_json::Value::String(with_suffix(&node.system_prompt, tz))); + first_msg.content = Some(serde_json::Value::String(with_suffix( + &node.system_prompt, + tz, + ))); } } } diff --git a/voice/engine/crates/agent-kit/src/agent_backends/mod.rs b/voice/engine/crates/agent-kit/src/agent_backends/mod.rs index bf2855a..5331d80 100644 --- a/voice/engine/crates/agent-kit/src/agent_backends/mod.rs +++ b/voice/engine/crates/agent-kit/src/agent_backends/mod.rs @@ -204,8 +204,6 @@ pub struct AgentBackendConfig { /// swap in fresh credentials mid-session, while QuickJS `secret()` reads /// always see the latest value via a non-blocking read lock. pub secrets: SharedSecretMap, - /// Summarize long tool results before feeding them to the main LLM. - pub tool_summarizer: bool, /// Compress conversation history when it grows too long. pub context_summarizer: bool, /// Speak a brief filler phrase while side-effecting tools run. @@ -226,7 +224,6 @@ impl std::fmt::Debug for AgentBackendConfig { self.secrets.read().map(|s| s.len()).unwrap_or(0) ), ) - .field("tool_summarizer", &self.tool_summarizer) .field("context_summarizer", &self.context_summarizer) .field("tool_filler", &self.tool_filler) .finish() @@ -240,7 +237,6 @@ impl Default for AgentBackendConfig { max_tokens: 32768, max_tool_rounds: 5, secrets: std::sync::Arc::new(std::sync::RwLock::new(SecretMap::new())), - tool_summarizer: false, context_summarizer: false, tool_filler: false, } diff --git a/voice/engine/crates/agent-kit/src/micro_tasks.rs b/voice/engine/crates/agent-kit/src/micro_tasks.rs index 87358f6..288b263 100644 --- a/voice/engine/crates/agent-kit/src/micro_tasks.rs +++ b/voice/engine/crates/agent-kit/src/micro_tasks.rs @@ -1,16 +1,14 @@ //! Internal LLM micro-call tasks managed directly by `DefaultAgentBackend`. //! //! These tasks run in the background to augment agentic logic: -//! bridging silence with filler words, and compressing tool logs. +//! bridging silence with filler words and summarizing tool output. //! They are fully internal — no public traits, no external customization points. -use std::sync::Arc; +use std::time::Duration; use tracing::{info, warn}; -use crate::providers::{collect_text, LlmCallConfig, LlmProvider}; use crate::agent_backends::ChatMessage; - -// ── Tool Summarizer ───────────────────────────────────────────────── +use crate::providers::{collect_text, LlmCallConfig, LlmProvider}; const TOOL_SUMMARY_PROMPT: &str = "\ You are a tool result summarizer for a voice assistant. Condense \ @@ -19,69 +17,76 @@ captures the key information the voice assistant needs to respond \ to the user. Keep only the facts that matter for the conversation.\n\n\ Output ONLY the summary. No explanation, no formatting."; -#[derive(Clone)] -pub(super) struct ToolResultSummarizer { - provider: Arc, - summary_min_length: usize, -} +const TOOL_SUMMARY_TIMEOUT: Duration = Duration::from_secs(8); +const TOOL_FILLER_TIMEOUT: Duration = Duration::from_secs(4); -impl ToolResultSummarizer { - pub(super) fn new(provider: Arc, summary_min_length: usize) -> Self { - Self { - provider, - summary_min_length, - } +pub(super) async fn summarize_tool_result( + provider: &dyn LlmProvider, + tool_name: &str, + raw_result: &str, + summary_min_length: usize, +) -> String { + if raw_result.len() < summary_min_length { + return raw_result.to_string(); } - pub(super) async fn transform(&self, tool_name: &str, raw_result: &str) -> String { - if raw_result.len() < self.summary_min_length { - return raw_result.to_string(); - } - let messages = vec![ - ChatMessage { - role: "system".to_string(), - content: Some(serde_json::Value::String(TOOL_SUMMARY_PROMPT.to_string())), - tool_calls: None, - tool_call_id: None, - }, - ChatMessage { - role: "user".to_string(), - content: Some(serde_json::Value::String(format!( - "Tool: {}\n\nRaw output:\n{}", - tool_name, raw_result - ))), - tool_calls: None, - tool_call_id: None, - }, - ]; - let config = LlmCallConfig { - temperature: 0.0, - max_tokens: 200, - model: None, - }; - match collect_text(&*self.provider, &messages, &config).await { - Ok(text) => { - let trimmed = text.trim().to_string(); - if trimmed.is_empty() { - raw_result.to_string() - } else { - info!( - "[agent_backend::helpers] Tool result summarized ({}): {} → {} chars", - tool_name, - raw_result.len(), - trimmed.len() - ); - trimmed - } - } - Err(e) => { - warn!( - "[agent_backend::helpers] Tool {} summarization failed: {} — using raw result", - tool_name, e - ); + let messages = vec![ + ChatMessage { + role: "system".to_string(), + content: Some(serde_json::Value::String(TOOL_SUMMARY_PROMPT.to_string())), + tool_calls: None, + tool_call_id: None, + }, + ChatMessage { + role: "user".to_string(), + content: Some(serde_json::Value::String(format!( + "Tool: {}\n\nRaw output:\n{}", + tool_name, raw_result + ))), + tool_calls: None, + tool_call_id: None, + }, + ]; + let config = LlmCallConfig { + temperature: 0.0, + max_tokens: 200, + model: None, + }; + + match tokio::time::timeout( + TOOL_SUMMARY_TIMEOUT, + collect_text(provider, &messages, &config), + ) + .await + { + Ok(Ok(text)) => { + let trimmed = text.trim().to_string(); + if trimmed.is_empty() { raw_result.to_string() + } else { + info!( + "[agent_backend::helpers] Tool result summarized ({}): {} -> {} chars", + tool_name, + raw_result.len(), + trimmed.len() + ); + trimmed } } + Ok(Err(e)) => { + warn!( + "[agent_backend::helpers] Tool {} summarization failed: {} - using raw result", + tool_name, e + ); + raw_result.to_string() + } + Err(_) => { + warn!( + "[agent_backend::helpers] Tool {} summarization timed out after {:?} - using raw result", + tool_name, TOOL_SUMMARY_TIMEOUT + ); + raw_result.to_string() + } } } @@ -122,8 +127,13 @@ pub(super) async fn generate_tool_filler( max_tokens: 30, model: None, }; - match collect_text(provider, &messages, &config).await { - Ok(text) => { + match tokio::time::timeout( + TOOL_FILLER_TIMEOUT, + collect_text(provider, &messages, &config), + ) + .await + { + Ok(Ok(text)) => { let trimmed = text.trim().to_string(); if trimmed.is_empty() { None @@ -135,9 +145,16 @@ pub(super) async fn generate_tool_filler( Some(trimmed) } } - Err(e) => { + Ok(Err(e)) => { warn!("[agent_backend::helpers] Tool filler failed: {}", e); None } + Err(_) => { + warn!( + "[agent_backend::helpers] Tool filler timed out after {:?}", + TOOL_FILLER_TIMEOUT + ); + None + } } } diff --git a/voice/engine/crates/agent-kit/src/quickjs_engine.rs b/voice/engine/crates/agent-kit/src/quickjs_engine.rs index 00cac64..8013f7a 100644 --- a/voice/engine/crates/agent-kit/src/quickjs_engine.rs +++ b/voice/engine/crates/agent-kit/src/quickjs_engine.rs @@ -685,6 +685,7 @@ mod tests { params, cancel_on_barge_in: true, side_effect: false, + summarize_result: false, } } diff --git a/voice/engine/crates/agent-kit/src/swarm.rs b/voice/engine/crates/agent-kit/src/swarm.rs index b03bab6..50f7cbc 100644 --- a/voice/engine/crates/agent-kit/src/swarm.rs +++ b/voice/engine/crates/agent-kit/src/swarm.rs @@ -17,7 +17,6 @@ use std::collections::HashMap; - use serde_json::json; // ── Graph Definition ──────────────────────────────────────────── @@ -48,7 +47,7 @@ use serde_json::json; // Re-export canonical recording type definitions from common. pub use common::{AudioFormat, AudioLayout, RecordingConfig}; -pub use proto::agent::{AgentGraphDef, NodeDef, ToolDef, ParamDef}; +pub use proto::agent::{AgentGraphDef, NodeDef, ParamDef, ToolDef}; // ── Runtime State ─────────────────────────────────────────────── @@ -460,8 +459,7 @@ mod tests { #[test] fn node_tool_schemas_include_transfer() { let graph = sample_graph(); - let schemas = - build_node_tool_schemas(&graph.nodes["receptionist"], &graph.tools); + let schemas = build_node_tool_schemas(&graph.nodes["receptionist"], &graph.tools); // Base tools + transfer_to + hang_up + on_hold + artifacts assert!( diff --git a/voice/engine/crates/agent-kit/src/tool_executor.rs b/voice/engine/crates/agent-kit/src/tool_executor.rs index 01e4de1..c842797 100644 --- a/voice/engine/crates/agent-kit/src/tool_executor.rs +++ b/voice/engine/crates/agent-kit/src/tool_executor.rs @@ -3,6 +3,8 @@ use tokio::sync::mpsc; use tracing::{info, warn}; use crate::agent_backends::{AfterToolCallAction, BeforeToolCallAction, ToolInterceptor}; +use crate::micro_tasks; +use crate::providers::LlmProvider; use crate::ScriptEngine; // ── Types ─────────────────────────────────────────────────────── @@ -82,6 +84,22 @@ pub(super) struct ToolTaskResult { pub result: String, } +const TOOL_SUMMARY_MIN_LENGTH: usize = 500; +const TOOL_RESULT_HARD_CAP_CHARS: usize = 8000; + +fn cap_tool_result(result: &str, max_chars: usize) -> String { + let char_count = result.chars().count(); + if char_count <= max_chars { + return result.to_string(); + } + let truncated: String = result.chars().take(max_chars).collect(); + format!( + "{}\n\n[tool result truncated: {} chars omitted]", + truncated, + char_count.saturating_sub(max_chars) + ) +} + // ── Pipeline ──────────────────────────────────────────────────── /// Spawns a background task to execute a tool call, routing through hooks and engines. @@ -92,8 +110,10 @@ pub(super) fn spawn_tool_task( name: String, args: String, side_effect: bool, + summarize: bool, script_engine_opt: Option>, interceptor_opt: Option>, + summary_provider_opt: Option>, tx: mpsc::UnboundedSender, ) { tokio::spawn(async move { @@ -181,8 +201,12 @@ pub(super) fn spawn_tool_task( } }; - // Execution Timeout limit (25 seconds) - let result = + // Execution timeout: 25 seconds. + // If post-processing (summarization) is enabled, add up to + // TOOL_SUMMARY_TIMEOUT (8 s) for a worst-case total of ~33 s. + // Both limits are enforced inside this detached tokio::spawn, + // so neither blocks the reactor's select! loop directly. + let mut result = match tokio::time::timeout(std::time::Duration::from_secs(25), result_fut).await { Ok(r) => r, Err(_) => { @@ -192,6 +216,42 @@ pub(super) fn spawn_tool_task( } }; + if result.success { + if summarize { + if let Some(provider) = summary_provider_opt.as_deref() { + result.result = micro_tasks::summarize_tool_result( + provider, + &task_name, + &result.result, + TOOL_SUMMARY_MIN_LENGTH, + ) + .await; + } + } + + let capped = cap_tool_result(&result.result, TOOL_RESULT_HARD_CAP_CHARS); + if capped.len() != result.result.len() { + info!( + tool.name = %task_name, + tool.call_id = %call_id, + tool.before_chars = result.result.len(), + tool.after_chars = capped.len(), + tool.summarize = summarize, + "[agent_backend] Tool result capped before enqueue" + ); + } + result.result = capped; + } + + info!( + tool.name = %task_name, + tool.call_id = %call_id, + tool.side_effect = side_effect, + tool.success = result.success, + tool.result_chars = result.result.len(), + "[agent_backend] Tool task finished; sending result to backend channel" + ); + if tx .send(ToolTaskResult { call_id: call_id.clone(), @@ -209,6 +269,12 @@ pub(super) fn spawn_tool_task( tool.result_chars = result.result.len(), "[agent_backend] Tool completed after session ended (result orphaned)" ); + } else { + info!( + tool.name = %task_name, + tool.call_id = %call_id, + "[agent_backend] Tool result sent to backend channel" + ); } }); } @@ -293,9 +359,23 @@ mod tests { #[test] fn classify_whitespace_trimmed_before_parsing() { - let out = - ToolOutcome::classify_script_result(" {\"result\": \"trimmed\"} ".to_string()); + let out = ToolOutcome::classify_script_result(" {\"result\": \"trimmed\"} ".to_string()); assert!(out.success); assert_eq!(out.result, "trimmed"); } + + #[test] + fn cap_tool_result_exact_boundary() { + let input = "a".repeat(8000); + let output = cap_tool_result(&input, 8000); + assert_eq!(output, input); // no truncation notice + } + + #[test] + fn cap_tool_result_exceeds_boundary() { + let input = "a".repeat(8001); + let output = cap_tool_result(&input, 8000); + assert!(output.starts_with(&"a".repeat(8000))); + assert!(output.ends_with("[tool result truncated: 1 chars omitted]")); + } } diff --git a/voice/engine/src/reactor/mod.rs b/voice/engine/src/reactor/mod.rs index 4fb1214..81513ae 100644 --- a/voice/engine/src/reactor/mod.rs +++ b/voice/engine/src/reactor/mod.rs @@ -611,17 +611,32 @@ impl Reactor { break; } + // LLM events are polled first (biased select arm #1) to prevent + // rapid RTP ingress from repeatedly cancelling `llm.recv()` before + // a ready tool-result event is consumed. + // + // Trade-off: if the LLM channel is continuously ready (fast token + // stream), audio_rx could be transiently starved. This is acceptable + // because: (a) LLM streams are bounded in duration, (b) VAD and the + // denoiser run on the audio thread and do not block on this recv(), + // and (c) a brief delay in on_audio() during token streaming has no + // perceptible impact on voice quality or latency. + // + // replay_log.record(ReactorInput::*) arms below capture a typed + // input snapshot (opt-in, zero-cost when disabled) used by the + // replay/sim harness for deterministic testing. Each arm decides + // independently whether to record. See reactor/replay.rs. tokio::select! { biased; - // ── Highest priority: incoming audio (keep VAD responsive) ── - // - // replay_log.record(ReactorInput::*) - // → typed input snapshot (opt-in, zero-cost when disabled). - // Used by the replay/sim harness for deterministic testing. - // - // Each arm decides independently whether to call one or both. - // See reactor/replay.rs for the ReactorInput type documentation. + // ── LLM tokens / tool calls / finished ── + // Prioritized over audio — see rationale above. + Some(ev) = self.llm.recv(), if self.llm.is_active() => { + self.replay_log.record(replay::ReactorInput::LlmEvent(ev.clone())); + self.on_llm_event(ev).await; + } + + // ── Audio (keep VAD responsive) ── msg = self.audio_rx.recv() => { match msg { Some(raw) => { @@ -647,12 +662,6 @@ impl Reactor { self.on_stt_event(ev).await; } - // ── LLM tokens / tool calls / finished ── - Some(ev) = self.llm.recv(), if self.llm.is_active() => { - self.replay_log.record(replay::ReactorInput::LlmEvent(ev.clone())); - self.on_llm_event(ev).await; - } - // ── TTS audio chunks ── Some(ev) = self.tts.recv(), if self.tts.is_active() => { // Record non-audio events only — audio chunks are large and diff --git a/voice/engine/src/session.rs b/voice/engine/src/session.rs index 842acff..3143119 100644 --- a/voice/engine/src/session.rs +++ b/voice/engine/src/session.rs @@ -227,7 +227,6 @@ impl VoiceSession { max_tokens: config.max_tokens, max_tool_rounds: 5, secrets, - tool_summarizer: task.agent_tool_summarizer, context_summarizer: task.agent_context_summarizer, tool_filler: task.agent_tool_filler, }; @@ -277,7 +276,6 @@ impl VoiceSession { max_tokens: config.max_tokens, max_tool_rounds: 5, secrets, - tool_summarizer: task.agent_tool_summarizer, context_summarizer: task.agent_context_summarizer, tool_filler: task.agent_tool_filler, }; diff --git a/voice/engine/src/settings.rs b/voice/engine/src/settings.rs index a0958a5..fbaa84a 100644 --- a/voice/engine/src/settings.rs +++ b/voice/engine/src/settings.rs @@ -48,14 +48,10 @@ use serde::Deserialize; /// /// | Variable | Default | Description | /// |---|---|---| -/// | `AGENT__TOOL_SUMMARIZER` | `true` | Summarize long tool results before feeding to LLM | /// | `AGENT__CONTEXT_SUMMARIZER` | `true` | Compress conversation history when it grows long | /// | `AGENT__TOOL_FILLER` | `false` | Speak a filler phrase while side-effecting tools run | #[derive(Debug, Clone, Deserialize)] pub struct AgentTaskSettings { - #[serde(rename = "agent__tool_summarizer", default = "default_true")] - pub agent_tool_summarizer: bool, - #[serde(rename = "agent__context_summarizer", default = "default_true")] pub agent_context_summarizer: bool, @@ -98,7 +94,6 @@ impl AgentTaskSettings { impl Default for AgentTaskSettings { fn default() -> Self { Self { - agent_tool_summarizer: true, agent_context_summarizer: true, agent_tool_filler: false, }