From ad4e4cca8f524bb5ab9aa0f3e4b52e3d5a872f50 Mon Sep 17 00:00:00 2001 From: viniciusventura29 Date: Thu, 22 Jan 2026 10:04:22 -0300 Subject: [PATCH 01/11] feat: add audio transcription functionality - Introduced a new transcription API route to handle audio-to-text conversion. - Implemented audio recording capabilities in the chat input component, allowing users to record and transcribe audio messages. - Added hooks for audio recording management and binding detection for transcription and object storage. - Updated the chat context to include binding availability for transcription services. - Enhanced the UI to show recording options based on available bindings. --- apps/mesh/src/api/app.ts | 4 + apps/mesh/src/api/routes/transcribe.ts | 360 ++++++++++++++++++ apps/mesh/src/web/components/chat/context.tsx | 23 ++ apps/mesh/src/web/components/chat/input.tsx | 143 +++++++ apps/mesh/src/web/hooks/use-audio-recorder.ts | 272 +++++++++++++ apps/mesh/src/web/hooks/use-binding.ts | 9 +- packages/bindings/src/index.ts | 16 + .../bindings/src/well-known/transcription.ts | 159 ++++++++ 8 files changed, 985 insertions(+), 1 deletion(-) create mode 100644 apps/mesh/src/api/routes/transcribe.ts create mode 100644 apps/mesh/src/web/hooks/use-audio-recorder.ts create mode 100644 packages/bindings/src/well-known/transcription.ts diff --git a/apps/mesh/src/api/app.ts b/apps/mesh/src/api/app.ts index b81fb10704..efd346ba66 100644 --- a/apps/mesh/src/api/app.ts +++ b/apps/mesh/src/api/app.ts @@ -35,6 +35,7 @@ import oauthProxyRoutes, { } from "./routes/oauth-proxy"; import proxyRoutes from "./routes/proxy"; import publicConfigRoutes from "./routes/public-config"; +import transcribeRoutes from "./routes/transcribe"; import { isDecoHostedMcp, DECO_STORE_URL, @@ -587,6 +588,9 @@ export function createApp(options: CreateAppOptions = {}) { // OpenAI-compatible LLM API routes app.route("/api", openaiCompatRoutes); + // Audio transcription routes + app.route("/api", transcribeRoutes); + // Public Events endpoint app.post("/org/:organizationId/events/:type", async (c) => { const orgId = c.req.param("organizationId"); diff --git a/apps/mesh/src/api/routes/transcribe.ts b/apps/mesh/src/api/routes/transcribe.ts new file mode 100644 index 0000000000..4f97eeb3b4 --- /dev/null +++ b/apps/mesh/src/api/routes/transcribe.ts @@ -0,0 +1,360 @@ +/** + * Transcription API Route + * + * Provides audio transcription functionality by: + * 1. Receiving audio via FormData (blob) or URL + * 2. Finding a connection with TRANSCRIPTION_BINDING + * 3. Using OBJECT_STORAGE_BINDING for temporary upload if needed + * 4. Calling TRANSCRIBE_AUDIO and returning the result + */ + +import { + TranscriptionBinding, + TRANSCRIPTION_BINDING, + OBJECT_STORAGE_BINDING, + SUPPORTED_AUDIO_FORMATS, + connectionImplementsBinding, +} from "@decocms/bindings"; +import { Hono } from "hono"; +import type { MeshContext } from "../../core/mesh-context"; +import type { ConnectionEntity } from "../../tools/connection/schema"; + +type Variables = { + meshContext: MeshContext; +}; + +const app = new Hono<{ Variables: Variables }>(); + +const MAX_FILE_SIZE = 25 * 1024 * 1024; // 25MB + +/** + * Find a connection that implements TRANSCRIPTION_BINDING + */ +async function findTranscriptionConnection( + ctx: MeshContext, + organizationId: string, +): Promise { + const connections = await ctx.storage.connections.list(organizationId); + return ( + connections.find( + (conn) => + conn.status === "active" && + connectionImplementsBinding(conn, TRANSCRIPTION_BINDING), + ) ?? null + ); +} + +/** + * Find a connection that implements OBJECT_STORAGE_BINDING + */ +async function findObjectStorageConnection( + ctx: MeshContext, + organizationId: string, +): Promise { + const connections = await ctx.storage.connections.list(organizationId); + return ( + connections.find( + (conn) => + conn.status === "active" && + connectionImplementsBinding(conn, OBJECT_STORAGE_BINDING), + ) ?? null + ); +} + +/** + * Upload audio to object storage and get a presigned URL + */ +async function uploadAudioToObjectStorage( + ctx: MeshContext, + connection: ConnectionEntity, + audioBlob: Blob, + mimeType: string, +): Promise<{ url: string; key: string }> { + const proxy = await ctx.createMCPProxy(connection); + + // Generate unique key for temporary audio file + const timestamp = Date.now(); + const randomSuffix = Math.random().toString(36).substring(2, 8); + const extension = mimeType.split("/")[1]?.split(";")[0] || "webm"; + const key = `_transcription_temp/${timestamp}-${randomSuffix}.${extension}`; + + // Get presigned URL for upload + const putResult = await proxy.client.callTool({ + name: "PUT_PRESIGNED_URL", + arguments: { + key, + contentType: mimeType, + expiresIn: 300, // 5 minutes + }, + }); + + if (putResult.isError) { + const errorText = + putResult.content + .map((c: { type: string; text?: string }) => + c.type === "text" ? c.text : "", + ) + .join("\n") || "Failed to get upload URL"; + throw new Error(errorText); + } + + // Extract URL from result + const putContent = putResult.content.find( + (c: { type: string }) => c.type === "text", + ); + if (!putContent || putContent.type !== "text") { + throw new Error("Invalid PUT_PRESIGNED_URL response"); + } + + const putData = JSON.parse((putContent as { text: string }).text) as { + url: string; + }; + + // Upload the audio blob + const uploadResponse = await fetch(putData.url, { + method: "PUT", + body: audioBlob, + headers: { + "Content-Type": mimeType, + }, + }); + + if (!uploadResponse.ok) { + throw new Error(`Failed to upload audio: ${uploadResponse.statusText}`); + } + + // Get presigned URL for reading + const getResult = await proxy.client.callTool({ + name: "GET_PRESIGNED_URL", + arguments: { + key, + expiresIn: 300, // 5 minutes + }, + }); + + if (getResult.isError) { + const errorText = + getResult.content + .map((c: { type: string; text?: string }) => + c.type === "text" ? c.text : "", + ) + .join("\n") || "Failed to get download URL"; + throw new Error(errorText); + } + + const getContent = getResult.content.find( + (c: { type: string }) => c.type === "text", + ); + if (!getContent || getContent.type !== "text") { + throw new Error("Invalid GET_PRESIGNED_URL response"); + } + + const getData = JSON.parse((getContent as { text: string }).text) as { + url: string; + }; + + return { url: getData.url, key }; +} + +/** + * Delete temporary audio file from object storage + */ +async function deleteAudioFromObjectStorage( + ctx: MeshContext, + connection: ConnectionEntity, + key: string, +): Promise { + try { + const proxy = await ctx.createMCPProxy(connection); + await proxy.client.callTool({ + name: "DELETE_OBJECT", + arguments: { key }, + }); + } catch (error) { + // Log but don't fail if cleanup fails + console.warn("[transcribe] Failed to cleanup temporary file:", key, error); + } +} + +/** + * POST /:org/transcribe + * + * Transcribe audio to text using available transcription service. + * + * Request: FormData with: + * - audio: Blob (audio file) + * - audioUrl: string (optional, URL to audio file) + * - language: string (optional, ISO 639-1 language code) + * + * Response: { text, language, duration, confidence } + */ +app.post("/:org/transcribe", async (c) => { + const ctx = c.get("meshContext"); + const orgSlug = c.req.param("org"); + + // 1. Validate auth + if (!ctx.auth.user?.id && !ctx.auth.apiKey?.id) { + return c.json({ error: "Authentication required" }, 401); + } + + // 2. Validate organization + if (!ctx.organization) { + return c.json({ error: "Organization context required" }, 400); + } + + if (ctx.organization.slug !== orgSlug && ctx.organization.id !== orgSlug) { + return c.json({ error: "Organization mismatch" }, 403); + } + + const organizationId = ctx.organization.id; + + // 3. Parse FormData + let formData: FormData; + try { + formData = await c.req.formData(); + } catch { + return c.json({ error: "Invalid form data" }, 400); + } + + const audioFile = formData.get("audio") as File | null; + const audioUrl = formData.get("audioUrl") as string | null; + const language = formData.get("language") as string | null; + + if (!audioFile && !audioUrl) { + return c.json({ error: "Either audio file or audioUrl is required" }, 400); + } + + // 4. Validate file size and format + if (audioFile) { + if (audioFile.size > MAX_FILE_SIZE) { + return c.json( + { + error: `File too large. Maximum size is ${MAX_FILE_SIZE / 1024 / 1024}MB`, + }, + 400, + ); + } + + const mimeType = audioFile.type.split(";")[0]; + if ( + !SUPPORTED_AUDIO_FORMATS.includes( + mimeType as (typeof SUPPORTED_AUDIO_FORMATS)[number], + ) + ) { + return c.json( + { + error: `Unsupported audio format: ${mimeType}. Supported formats: ${SUPPORTED_AUDIO_FORMATS.join(", ")}`, + }, + 400, + ); + } + } + + // 5. Find transcription connection + const transcriptionConnection = await findTranscriptionConnection( + ctx, + organizationId, + ); + + if (!transcriptionConnection) { + return c.json( + { + error: + "No transcription service configured. Please add a connection with transcription capabilities (e.g., OpenAI Whisper).", + }, + 400, + ); + } + + // 6. Handle audio upload if blob provided + let finalAudioUrl = audioUrl; + let tempFileKey: string | null = null; + let objectStorageConnection: ConnectionEntity | null = null; + + if (audioFile && !audioUrl) { + // Find object storage connection for temporary upload + objectStorageConnection = await findObjectStorageConnection( + ctx, + organizationId, + ); + + if (!objectStorageConnection) { + return c.json( + { + error: + "No object storage configured. Please add a connection with object storage capabilities (e.g., S3, R2, GCS) or provide an audioUrl instead.", + }, + 400, + ); + } + + try { + const uploadResult = await uploadAudioToObjectStorage( + ctx, + objectStorageConnection, + audioFile, + audioFile.type, + ); + finalAudioUrl = uploadResult.url; + tempFileKey = uploadResult.key; + } catch (error) { + console.error("[transcribe] Upload failed:", error); + return c.json( + { + error: `Failed to upload audio: ${error instanceof Error ? error.message : "Unknown error"}`, + }, + 500, + ); + } + } + + // 7. Call transcription service + try { + const proxy = await ctx.createMCPProxy(transcriptionConnection); + const transcriptionClient = TranscriptionBinding.forClient(proxy); + + const result = await transcriptionClient.TRANSCRIBE_AUDIO({ + audioUrl: finalAudioUrl ?? undefined, + mimeType: audioFile?.type, + language: language ?? undefined, + }); + + // 8. Cleanup temporary file + if (tempFileKey && objectStorageConnection) { + // Don't await - cleanup in background + void deleteAudioFromObjectStorage( + ctx, + objectStorageConnection, + tempFileKey, + ); + } + + // 9. Return result + return c.json({ + text: result.text, + language: result.language, + duration: result.duration, + confidence: result.confidence, + }); + } catch (error) { + console.error("[transcribe] Transcription failed:", error); + + // Cleanup on error + if (tempFileKey && objectStorageConnection) { + void deleteAudioFromObjectStorage( + ctx, + objectStorageConnection, + tempFileKey, + ); + } + + return c.json( + { + error: `Transcription failed: ${error instanceof Error ? error.message : "Unknown error"}`, + }, + 500, + ); + } +}); + +export default app; diff --git a/apps/mesh/src/web/components/chat/context.tsx b/apps/mesh/src/web/components/chat/context.tsx index d090fa08b0..539b35ca0b 100644 --- a/apps/mesh/src/web/components/chat/context.tsx +++ b/apps/mesh/src/web/components/chat/context.tsx @@ -25,7 +25,9 @@ import { type PropsWithChildren, } from "react"; import { toast } from "sonner"; +import { useConnections } from "../../hooks/collections/use-connection"; import { useModelConnections } from "../../hooks/collections/use-llm"; +import { useBindingConnections } from "../../hooks/use-binding"; import { getThreadFromIndexedDB, useMessageActions, @@ -122,6 +124,10 @@ interface ChatContextValue { clearChatError: () => void; finishReason: string | null; clearFinishReason: () => void; + + // Binding availability + hasTranscriptionBinding: boolean; + hasObjectStorageBinding: boolean; } // ============================================================================ @@ -508,6 +514,19 @@ export function ChatProvider({ children }: PropsWithChildren) { const modelsConnections = useModelConnections(); const [selectedModel, setModel] = useModelState(locator, modelsConnections); + // Binding detection for transcription feature + const allConnections = useConnections(); + const transcriptionConnections = useBindingConnections({ + connections: allConnections, + binding: "TRANSCRIPTION", + }); + const objectStorageConnections = useBindingConnections({ + connections: allConnections, + binding: "OBJECT_STORAGE", + }); + const hasTranscriptionBinding = transcriptionConnections.length > 0; + const hasObjectStorageBinding = objectStorageConnections.length > 0; + // Context prompt const contextPrompt = useContextHook(storedSelectedVirtualMcpId); @@ -778,6 +797,10 @@ export function ChatProvider({ children }: PropsWithChildren) { clearChatError: chat.clearError, finishReason: chatState.finishReason, clearFinishReason, + + // Binding availability + hasTranscriptionBinding, + hasObjectStorageBinding, }; return {children}; diff --git a/apps/mesh/src/web/components/chat/input.tsx b/apps/mesh/src/web/components/chat/input.tsx index 783ba5bf6b..1a101d9bfb 100644 --- a/apps/mesh/src/web/components/chat/input.tsx +++ b/apps/mesh/src/web/components/chat/input.tsx @@ -9,6 +9,11 @@ import { } from "@deco/ui/components/popover.tsx"; import { cn } from "@deco/ui/lib/utils.ts"; import { useNavigate } from "@tanstack/react-router"; +import { + Tooltip, + TooltipContent, + TooltipTrigger, +} from "@deco/ui/components/tooltip.tsx"; import { AlertCircle, AlertTriangle, @@ -17,11 +22,15 @@ import { CornerUpLeft, CpuChip02, Edit01, + Microphone01, Stop, + StopCircle, XCircle, } from "@untitledui/icons"; import type { FormEvent } from "react"; import { useEffect, useRef, useState, type MouseEvent } from "react"; +import { toast } from "sonner"; +import { useAudioRecorder } from "../../hooks/use-audio-recorder"; import { useChat } from "./context"; import { isTiptapDocEmpty } from "./tiptap/utils"; import { ChatHighlight } from "./index"; @@ -200,10 +209,33 @@ export function ChatInput() { clearChatError, finishReason, clearFinishReason, + hasTranscriptionBinding, + hasObjectStorageBinding, } = useChat(); + const { org } = useProjectContext(); + const tiptapRef = useRef(null); + // Audio recording state + const { + isRecording, + startRecording, + stopRecording, + error: recordingError, + clearError: clearRecordingError, + } = useAudioRecorder({ maxDuration: 3 * 60 * 1000 }); // 3 minutes max + const [isTranscribing, setIsTranscribing] = useState(false); + + // Show toast when recording error occurs + // oxlint-disable-next-line ban-use-effect/ban-use-effect + useEffect(() => { + if (recordingError) { + toast.error(recordingError.message); + clearRecordingError(); + } + }, [recordingError, clearRecordingError]); + const canSubmit = !isStreaming && !!selectedModel && !isTiptapDocEmpty(tiptapDoc); @@ -247,6 +279,60 @@ export function ChatInput() { void sendMessage(doc); }; + const handleRecordingToggle = async () => { + if (isTranscribing) return; + + if (isRecording) { + const audioBlob = await stopRecording(); + if (!audioBlob) { + toast.error("Falha ao gravar áudio"); + return; + } + + setIsTranscribing(true); + try { + const formData = new FormData(); + formData.append("audio", audioBlob, "recording.webm"); + + const response = await fetch(`/api/${org.slug}/transcribe`, { + method: "POST", + body: formData, + credentials: "include", + }); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + throw new Error( + (errorData as { error?: string }).error || "Falha na transcrição", + ); + } + + const data = (await response.json()) as { text?: string }; + if (data.text) { + // Insert transcribed text into the input + const doc = { + type: "doc" as const, + content: [ + { + type: "paragraph", + content: [{ type: "text", text: data.text }], + }, + ], + }; + setTiptapDoc(doc); + } + } catch (err) { + toast.error( + err instanceof Error ? err.message : "Erro ao transcrever áudio", + ); + } finally { + setIsTranscribing(false); + } + } else { + await startRecording(); + } + }; + const color = selectedVirtualMcp ? getGatewayColor(selectedVirtualMcp.id) : null; @@ -399,6 +485,63 @@ export function ChatInput() { selectedModel={selectedModel} isStreaming={isStreaming} /> + {/* Audio Recording Button - only show if transcription and object storage bindings are available */} + {hasTranscriptionBinding && hasObjectStorageBinding && ( + + + + + + {isTranscribing + ? "Transcrevendo..." + : isRecording + ? "Clique para parar e transcrever" + : "Gravar áudio"} + + + )} - + + + {!hasTranscriptionBinding && ( - {isTranscribing - ? "Transcribing..." - : isRecording - ? "Click to stop and transcribe" - : "Record audio"} + Add a transcription MCP to enable voice input - - )} + )} +