diff --git a/apps/mesh/src/api/app.ts b/apps/mesh/src/api/app.ts
index f394d6ce56..c38c3387db 100644
--- a/apps/mesh/src/api/app.ts
+++ b/apps/mesh/src/api/app.ts
@@ -36,6 +36,7 @@ import oauthProxyRoutes, {
 import openaiCompatRoutes from "./routes/openai-compat";
 import proxyRoutes from "./routes/proxy";
 import publicConfigRoutes from "./routes/public-config";
+import transcribeRoutes from "./routes/transcribe";
 import selfRoutes from "./routes/self";
 import { shouldSkipMeshContext, SYSTEM_PATHS } from "./utils/paths";
@@ -588,6 +589,9 @@ export function createApp(options: CreateAppOptions = {}) {
   // OpenAI-compatible LLM API routes
   app.route("/api", openaiCompatRoutes);
 
+  // Audio transcription routes
+  app.route("/api", transcribeRoutes);
+
   // Public Events endpoint
   app.post("/org/:organizationId/events/:type", async (c) => {
     const orgId = c.req.param("organizationId");
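For context, Hono's `app.route` mounts a sub-app under a prefix, so the handler registered as `/:org/transcribe` in the new router below ends up served at `POST /api/:org/transcribe` — the path the web client calls later in this diff. A minimal sketch of that composition (handler body is illustrative only):

```ts
import { Hono } from "hono";

// Sub-app, analogous to the new transcribe router below
const transcribe = new Hono();
transcribe.post("/:org/transcribe", (c) =>
  c.json({ ok: true, org: c.req.param("org") }),
);

// Mounting under "/api" composes the final path: POST /api/:org/transcribe
const app = new Hono();
app.route("/api", transcribe);
```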
"")) { + return true; + } + if ( + ipLower === "::1" || // loopback + ipLower.startsWith("fe80:") || // link-local + ipLower.startsWith("fc") || // unique local (fc00::/7) + ipLower.startsWith("fd") // unique local (fc00::/7) + ) { + return true; + } + + return false; +} + +/** + * Validate audioUrl to prevent SSRF attacks + * Checks URL format, scheme, and resolves DNS to verify IPs are public + */ +async function validateAudioUrl( + urlString: string, +): Promise<{ valid: true } | { valid: false; error: string }> { + let url: URL; + try { + url = new URL(urlString); + } catch { + return { valid: false, error: "Invalid URL format" }; + } + + // Only allow HTTP/HTTPS schemes + if (url.protocol !== "http:" && url.protocol !== "https:") { + return { valid: false, error: "Only HTTP and HTTPS URLs are allowed" }; + } + + const hostname = url.hostname.toLowerCase(); + + // Block localhost and loopback addresses (string check) + if (hostname === "localhost" || hostname === "[::1]") { + return { valid: false, error: "Localhost URLs are not allowed" }; + } + + // If hostname is already an IP, check it directly + if (isPrivateIp(hostname)) { + return { + valid: false, + error: "Private or internal IP addresses are not allowed", + }; + } + + // Resolve DNS and check all returned IPs to prevent DNS rebinding + try { + const results = await lookup(hostname, { all: true }); + for (const { address } of results) { + if (isPrivateIp(address)) { + return { + valid: false, + error: "URL resolves to a private or internal IP address", + }; + } + } + } catch { + return { valid: false, error: "Failed to resolve hostname" }; + } + + return { valid: true }; +} + +/** + * Find a connection that implements a specific binding + */ +async function findConnectionWithBinding( + ctx: MeshContext, + organizationId: string, + binding: Binder, +): Promise { + const connections = await ctx.storage.connections.list(organizationId); + return ( + connections.find( + (conn) => + conn.status === "active" && connectionImplementsBinding(conn, binding), + ) ?? null + ); +} + +/** + * Convert a Blob to base64 string + */ +async function blobToBase64(blob: Blob): Promise { + const arrayBuffer = await blob.arrayBuffer(); + return Buffer.from(arrayBuffer).toString("base64"); +} + +/** + * POST /:org/transcribe + * + * Transcribe audio to text using available transcription service. + * + * Request: FormData with: + * - audio: Blob (audio file) + * - audioUrl: string (optional, URL to audio file) + * - language: string (optional, ISO 639-1 language code) + * + * Response: { text, language, duration, confidence } + */ +app.post("/:org/transcribe", async (c) => { + const ctx = c.get("meshContext"); + const orgSlug = c.req.param("org"); + + // 1. Validate auth + if (!ctx.auth.user?.id && !ctx.auth.apiKey?.id) { + return c.json({ error: "Authentication required" }, 401); + } + + // 2. Validate organization + if (!ctx.organization) { + return c.json({ error: "Organization context required" }, 400); + } + + if (ctx.organization.slug !== orgSlug && ctx.organization.id !== orgSlug) { + return c.json({ error: "Organization mismatch" }, 403); + } + + const organizationId = ctx.organization.id; + + // 3. 
diff --git a/apps/mesh/src/web/components/chat/context.tsx b/apps/mesh/src/web/components/chat/context.tsx
index ef3b42d9ac..bb1d9b5b4a 100644
--- a/apps/mesh/src/web/components/chat/context.tsx
+++ b/apps/mesh/src/web/components/chat/context.tsx
@@ -14,6 +14,7 @@ import {
   useProjectContext,
   useVirtualMCPs,
   SELF_MCP_ALIAS_ID,
+  useConnections,
 } from "@decocms/mesh-sdk";
 import type { Client } from "@modelcontextprotocol/sdk/client/index.js";
 import type {
@@ -35,6 +36,7 @@ import {
   useReducer,
 } from "react";
 import { toast } from "sonner";
+import { useBindingConnections } from "../../hooks/use-binding";
 import { useModelConnections } from "../../hooks/collections/use-llm";
 import { useThreadMessages } from "../../hooks/use-chat-store";
 import { useContext as useContextHook } from "../../hooks/use-context";
@@ -123,6 +125,9 @@ interface ChatContextValue {
   clearChatError: () => void;
   finishReason: string | null;
   clearFinishReason: () => void;
+
+  // Binding availability
+  hasTranscriptionBinding: boolean;
 }
 
 // ============================================================================
@@ -558,6 +563,14 @@ export function ChatProvider({
   // Always fetch messages for the active thread - if it's truly new, the query returns empty
   const initialMessages = useThreadMessages(stateActiveThreadId);
 
+  // Binding detection for the transcription feature
+  const allConnections = useConnections();
+  const transcriptionConnections = useBindingConnections({
+    connections: allConnections,
+    binding: "TRANSCRIPTION",
+  });
+  const hasTranscriptionBinding = transcriptionConnections.length > 0;
+
   // Context prompt
   const contextPrompt = useContextHook(storedSelectedVirtualMcpId);
 
@@ -830,6 +843,9 @@ export function ChatProvider({
     clearChatError: chat.clearError,
     finishReason: chatState.finishReason,
     clearFinishReason,
+
+    // Binding availability
+    hasTranscriptionBinding,
   };
 
   return <ChatContext.Provider value={value}>{children}</ChatContext.Provider>;
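`useBindingConnections` itself is not part of this diff, so the exact filtering is not shown. Given the arguments passed above, a plausible sketch of its behavior follows — the `Connection` shape and `bindings` field here are hypothetical, purely for illustration:

```ts
// Hypothetical types; the real hook lives in ../../hooks/use-binding.
type Connection = { status: string; bindings?: string[] };

function useBindingConnectionsSketch({
  connections,
  binding,
}: {
  connections: Connection[];
  binding: string;
}): Connection[] {
  // Mirrors the server-side findConnectionWithBinding: keep active
  // connections that advertise the requested binding.
  return connections.filter(
    (conn) =>
      conn.status === "active" && (conn.bindings ?? []).includes(binding),
  );
}
```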
diff --git a/apps/mesh/src/web/components/chat/input.tsx b/apps/mesh/src/web/components/chat/input.tsx
index c1197dcb94..d517e8a8cb 100644
--- a/apps/mesh/src/web/components/chat/input.tsx
+++ b/apps/mesh/src/web/components/chat/input.tsx
@@ -9,6 +9,11 @@ import {
 } from "@deco/ui/components/popover.tsx";
 import { cn } from "@deco/ui/lib/utils.ts";
 import { useNavigate } from "@tanstack/react-router";
+import {
+  Tooltip,
+  TooltipContent,
+  TooltipTrigger,
+} from "@deco/ui/components/tooltip.tsx";
 import {
   AlertCircle,
   AlertTriangle,
@@ -16,11 +21,15 @@ import {
   ChevronDown,
   CpuChip02,
   Edit01,
+  Microphone01,
   Stop,
+  StopCircle,
   XCircle,
 } from "@untitledui/icons";
 import type { FormEvent } from "react";
 import { useEffect, useRef, useState, type MouseEvent } from "react";
+import { toast } from "sonner";
+import { useAudioRecorder } from "../../hooks/use-audio-recorder";
 import { useChat } from "./context";
 import { isTiptapDocEmpty } from "./tiptap/utils";
 import { ChatHighlight } from "./index";
@@ -196,10 +205,32 @@ export function ChatInput() {
     clearChatError,
     finishReason,
     clearFinishReason,
+    hasTranscriptionBinding,
   } = useChat();
+  const { org } = useProjectContext();
+
   const tiptapRef = useRef(null);
 
+  // Audio recording state
+  const {
+    isRecording,
+    startRecording,
+    stopRecording,
+    error: recordingError,
+    clearError: clearRecordingError,
+  } = useAudioRecorder({ maxDuration: 3 * 60 * 1000 }); // 3 minutes max
+  const [isTranscribing, setIsTranscribing] = useState(false);
+
+  // Show a toast when a recording error occurs
+  // oxlint-disable-next-line ban-use-effect/ban-use-effect
+  useEffect(() => {
+    if (recordingError) {
+      toast.error(recordingError.message);
+      clearRecordingError();
+    }
+  }, [recordingError, clearRecordingError]);
+
   const canSubmit =
     !isStreaming && !!selectedModel && !isTiptapDocEmpty(tiptapDoc);
@@ -236,6 +267,98 @@ export function ChatInput() {
     void sendMessage(doc);
   };
 
+  const handleRecordingToggle = async () => {
+    if (isTranscribing) return;
+
+    if (isRecording) {
+      const audioBlob = await stopRecording();
+      if (!audioBlob) {
+        toast.error("Failed to record audio");
+        return;
+      }
+
+      setIsTranscribing(true);
+      try {
+        const formData = new FormData();
+        formData.append("audio", audioBlob, "recording.webm");
+
+        const response = await fetch(`/api/${org.slug}/transcribe`, {
+          method: "POST",
+          body: formData,
+          credentials: "include",
+        });
+
+        if (!response.ok) {
+          const errorData = await response.json().catch(() => ({}));
+          throw new Error(
+            (errorData as { error?: string }).error || "Transcription failed",
+          );
+        }
+
+        const data = (await response.json()) as { text?: string };
+        if (data.text) {
+          // Append the transcribed text to the existing input content
+          const transcribedText = data.text;
+
+          // If there is existing content, append to it; otherwise create a new doc
+          if (tiptapDoc && tiptapDoc.content && tiptapDoc.content.length > 0) {
+            // Clone the existing document
+            const newContent = [...tiptapDoc.content];
+            const lastParagraphIndex = newContent.length - 1;
+            const lastParagraph = newContent[lastParagraphIndex];
+
+            // If the last paragraph has content, append with a space; otherwise just add the text
+            if (
+              lastParagraph &&
+              lastParagraph.type === "paragraph" &&
+              lastParagraph.content &&
+              lastParagraph.content.length > 0
+            ) {
+              // Append to the last paragraph with a space separator
+              newContent[lastParagraphIndex] = {
+                ...lastParagraph,
+                content: [
+                  ...lastParagraph.content,
+                  { type: "text", text: ` ${transcribedText}` },
+                ],
+              };
+            } else {
+              // The last paragraph is empty; replace it with the transcribed text
+              newContent[lastParagraphIndex] = {
+                type: "paragraph",
+                content: [{ type: "text", text: transcribedText }],
+              };
+            }
+
+            setTiptapDoc({
+              type: "doc" as const,
+              content: newContent,
+            });
+          } else {
+            // No existing content; create a new doc
+            setTiptapDoc({
+              type: "doc" as const,
+              content: [
+                {
+                  type: "paragraph",
+                  content: [{ type: "text", text: transcribedText }],
+                },
+              ],
+            });
+          }
+        }
+      } catch (err) {
+        toast.error(
+          err instanceof Error ? err.message : "Failed to transcribe audio",
+        );
+      } finally {
+        setIsTranscribing(false);
+      }
+    } else {
+      await startRecording();
+    }
+  };
+
   const color = selectedVirtualMcp
     ? getAgentColor(selectedVirtualMcp.id)
     : null;
@@ -367,6 +490,67 @@
           selectedModel={selectedModel}
           isStreaming={isStreaming}
         />
+        {/* Audio Recording Button - always visible, disabled if no transcription binding */}
+        <Tooltip>
+          <TooltipTrigger asChild>
+            {/* Wrap in span to enable tooltip on disabled button */}
+            <span>
+              <Button
+                type="button"
+                variant="ghost"
+                size="icon"
+                disabled={!hasTranscriptionBinding || isTranscribing}
+                onClick={handleRecordingToggle}
+                aria-label={isRecording ? "Stop recording" : "Record voice input"}
+              >
+                {isRecording ? <StopCircle /> : <Microphone01 />}
+              </Button>
+            </span>
+          </TooltipTrigger>
+          {!hasTranscriptionBinding && (
+            <TooltipContent>
+              Add a transcription MCP to enable voice input
+            </TooltipContent>
+          )}
+        </Tooltip>
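The diff references `../../hooks/use-audio-recorder` without including it. Based solely on the destructuring at the call site (`isRecording`, `startRecording`, `stopRecording` resolving to a `Blob` or `null`, `error`/`clearError`, and a `maxDuration` option), the hook's surface presumably looks like the sketch below — a hypothetical MediaRecorder-based implementation, not the actual file:

```ts
import { useCallback, useRef, useState } from "react";

// Hypothetical reconstruction: only the shape is implied by the diff;
// the implementation details here are assumptions.
export function useAudioRecorder({ maxDuration }: { maxDuration: number }) {
  const [isRecording, setIsRecording] = useState(false);
  const [error, setError] = useState<Error | null>(null);
  const recorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);

  const stopRecording = useCallback((): Promise<Blob | null> => {
    return new Promise((resolve) => {
      const recorder = recorderRef.current;
      if (!recorder || recorder.state === "inactive") return resolve(null);
      recorder.onstop = () => {
        if (timerRef.current) clearTimeout(timerRef.current);
        setIsRecording(false);
        resolve(new Blob(chunksRef.current, { type: "audio/webm" }));
      };
      recorder.stop();
      recorder.stream.getTracks().forEach((t) => t.stop());
    });
  }, []);

  const startRecording = useCallback(async () => {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      chunksRef.current = [];
      const recorder = new MediaRecorder(stream, { mimeType: "audio/webm" });
      recorder.ondataavailable = (e) => chunksRef.current.push(e.data);
      recorder.start();
      recorderRef.current = recorder;
      setIsRecording(true);
      // Auto-stop once maxDuration elapses (3 minutes at the call site)
      timerRef.current = setTimeout(() => void stopRecording(), maxDuration);
    } catch (err) {
      setError(err instanceof Error ? err : new Error("Microphone unavailable"));
    }
  }, [maxDuration, stopRecording]);

  const clearError = useCallback(() => setError(null), []);

  return { isRecording, startRecording, stopRecording, error, clearError };
}
```

The `recording.webm` filename and `audio/webm` MIME type used by the input component are consistent with this kind of MediaRecorder setup, which is why the server-side format check above splits off codec parameters (`audio/webm;codecs=opus`) before validating against `SUPPORTED_AUDIO_FORMATS`.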