From ad4e4cca8f524bb5ab9aa0f3e4b52e3d5a872f50 Mon Sep 17 00:00:00 2001
From: viniciusventura29 <viniciusventura29@gmail.com>
Date: Thu, 22 Jan 2026 10:04:22 -0300
Subject: [PATCH 01/11] feat: add audio transcription functionality

- Introduced a new transcription API route to handle audio-to-text conversion.
- Implemented audio recording capabilities in the chat input component, allowing users to record and transcribe audio messages.
- Added hooks for audio recording management and binding detection for transcription and object storage.
- Updated the chat context to include binding availability for transcription services.
- Enhanced the UI to show recording options based on available bindings.
---
 apps/mesh/src/api/app.ts                      |   4 +
 apps/mesh/src/api/routes/transcribe.ts        | 360 ++++++++++++++++++
 apps/mesh/src/web/components/chat/context.tsx |  23 ++
 apps/mesh/src/web/components/chat/input.tsx   | 143 +++++++
 apps/mesh/src/web/hooks/use-audio-recorder.ts | 272 +++++++++++++
 apps/mesh/src/web/hooks/use-binding.ts        |   9 +-
 packages/bindings/src/index.ts                |  16 +
 .../bindings/src/well-known/transcription.ts  | 159 ++++++++
 8 files changed, 985 insertions(+), 1 deletion(-)
 create mode 100644 apps/mesh/src/api/routes/transcribe.ts
 create mode 100644 apps/mesh/src/web/hooks/use-audio-recorder.ts
 create mode 100644 packages/bindings/src/well-known/transcription.ts
diff --git a/apps/mesh/src/api/app.ts b/apps/mesh/src/api/app.ts
index b81fb10704..efd346ba66 100644
--- a/apps/mesh/src/api/app.ts
+++ b/apps/mesh/src/api/app.ts
@@ -35,6 +35,7 @@ import oauthProxyRoutes, {
 } from "./routes/oauth-proxy";
 import proxyRoutes from "./routes/proxy";
 import publicConfigRoutes from "./routes/public-config";
+import transcribeRoutes from "./routes/transcribe";
 import {
   isDecoHostedMcp,
   DECO_STORE_URL,
@@ -587,6 +588,9 @@ export function createApp(options: CreateAppOptions = {}) {
   // OpenAI-compatible LLM API routes
   app.route("/api", openaiCompatRoutes);
 
+  // Audio transcription routes
+  app.route("/api", transcribeRoutes);
+
   // Public Events endpoint
   app.post("/org/:organizationId/events/:type", async (c) => {
     const orgId = c.req.param("organizationId");
diff --git a/apps/mesh/src/api/routes/transcribe.ts b/apps/mesh/src/api/routes/transcribe.ts
new file mode 100644
index 0000000000..4f97eeb3b4
--- /dev/null
+++ b/apps/mesh/src/api/routes/transcribe.ts
@@ -0,0 +1,360 @@
+/**
+ * Transcription API Route
+ *
+ * Provides audio transcription functionality by:
+ * 1. Receiving audio via FormData (blob) or URL
+ * 2. Finding a connection with TRANSCRIPTION_BINDING
+ * 3. Using OBJECT_STORAGE_BINDING for temporary upload if needed
+ * 4. Calling TRANSCRIBE_AUDIO and returning the result
+ */
+
+import {
+  TranscriptionBinding,
+  TRANSCRIPTION_BINDING,
+  OBJECT_STORAGE_BINDING,
+  SUPPORTED_AUDIO_FORMATS,
+  connectionImplementsBinding,
+} from "@decocms/bindings";
+import { Hono } from "hono";
+import type { MeshContext } from "../../core/mesh-context";
+import type { ConnectionEntity } from "../../tools/connection/schema";
+
+type Variables = {
+  meshContext: MeshContext;
+};
+
+const app = new Hono<{ Variables: Variables }>();
+
+const MAX_FILE_SIZE = 25 * 1024 * 1024; // 25MB
+
+/**
+ * Return the first active connection of the organization that implements
+ * TRANSCRIPTION_BINDING, or null when no such connection exists.
+ */
+async function findTranscriptionConnection(
+  ctx: MeshContext,
+  organizationId: string,
+): Promise<ConnectionEntity | null> {
+  const candidates = await ctx.storage.connections.list(organizationId);
+  const match = candidates.find((candidate) => {
+    if (candidate.status !== "active") return false;
+    return connectionImplementsBinding(candidate, TRANSCRIPTION_BINDING);
+  });
+
+  return match ?? null;
+}
+
+/**
+ * Return the first active connection of the organization that implements
+ * OBJECT_STORAGE_BINDING, or null when no such connection exists.
+ */
+async function findObjectStorageConnection(
+  ctx: MeshContext,
+  organizationId: string,
+): Promise<ConnectionEntity | null> {
+  const candidates = await ctx.storage.connections.list(organizationId);
+  const match = candidates.find((candidate) => {
+    if (candidate.status !== "active") return false;
+    return connectionImplementsBinding(candidate, OBJECT_STORAGE_BINDING);
+  });
+
+  return match ?? null;
+}
+
+/**
+ * Join the text parts of a tool result into one error message.
+ */
+function toolErrorText(
+  content: ReadonlyArray<{ type: string; text?: string }>,
+  fallback: string,
+): string {
+  const text = content.map((c) => (c.type === "text" ? c.text : "")).join("\n");
+  return text || fallback;
+}
+
+/**
+ * Parse the first text part of a presigned-URL tool result and return its url.
+ */
+function presignedUrlFrom(
+  content: ReadonlyArray<{ type: string; text?: string }>,
+  toolName: string,
+): string {
+  const text = content.find((c) => c.type === "text")?.text;
+  const url = text ? (JSON.parse(text) as { url?: unknown } | null)?.url : null;
+  // A missing or non-string url would otherwise only fail later, inside fetch.
+  if (typeof url !== "string" || url === "") {
+    throw new Error(`Invalid ${toolName} response`);
+  }
+  return url;
+}
+
+/**
+ * Upload audio to object storage and return a presigned read URL for it.
+ *
+ * Requests a PUT presigned URL, uploads the blob with `fetch`, then requests a
+ * GET presigned URL for the same key. Both URLs are requested with expiresIn 300.
+ *
+ * @param ctx - Mesh context used to create the MCP proxy for `connection`.
+ * @param connection - Connection whose presigned-URL tools are called.
+ * @param audioBlob - Audio payload sent as the PUT body.
+ * @param mimeType - Upload Content-Type; its subtype becomes the key extension.
+ * @returns The presigned read URL and the generated object key.
+ * @throws Error when a tool call reports an error or returns an unusable
+ *   payload, or when the upload response is not ok.
+ */
+async function uploadAudioToObjectStorage(
+  ctx: MeshContext,
+  connection: ConnectionEntity,
+  audioBlob: Blob,
+  mimeType: string,
+): Promise<{ url: string; key: string }> {
+  const proxy = await ctx.createMCPProxy(connection);
+
+  // Generate unique key for temporary audio file
+  const timestamp = Date.now();
+  const randomSuffix = Math.random().toString(36).substring(2, 8);
+  const extension = mimeType.split("/")[1]?.split(";")[0] || "webm";
+  const key = `_transcription_temp/${timestamp}-${randomSuffix}.${extension}`;
+
+  // Get presigned URL for upload
+  const putResult = await proxy.client.callTool({
+    name: "PUT_PRESIGNED_URL",
+    arguments: { key, contentType: mimeType, expiresIn: 300 },
+  });
+  if (putResult.isError) {
+    throw new Error(toolErrorText(putResult.content, "Failed to get upload URL"));
+  }
+  const uploadUrl = presignedUrlFrom(putResult.content, "PUT_PRESIGNED_URL");
+
+  // Upload the audio blob
+  const uploadResponse = await fetch(uploadUrl, {
+    method: "PUT",
+    body: audioBlob,
+    headers: { "Content-Type": mimeType },
+  });
+  if (!uploadResponse.ok) {
+    throw new Error(`Failed to upload audio: ${uploadResponse.statusText}`);
+  }
+
+  try {
+    const getResult = await proxy.client.callTool({
+      name: "GET_PRESIGNED_URL",
+      arguments: { key, expiresIn: 300 },
+    });
+    if (getResult.isError) {
+      throw new Error(
+        toolErrorText(getResult.content, "Failed to get download URL"),
+      );
+    }
+    const url = presignedUrlFrom(getResult.content, "GET_PRESIGNED_URL");
+    return { url, key };
+  } catch (error) {
+    // The object is already stored, but on failure the caller never receives
+    // its key and cannot clean up, so remove it here before propagating.
+    await deleteAudioFromObjectStorage(ctx, connection, key);
+    throw error;
+  }
+}
+
+/**
+ * Best-effort delete of a temporary audio file: any failure, including an `isError` tool result, is logged and swallowed.
+ */
+async function deleteAudioFromObjectStorage(
+  ctx: MeshContext,
+  connection: ConnectionEntity,
+  key: string,
+): Promise<void> {
+  try {
+    const proxy = await ctx.createMCPProxy(connection);
+    const result = await proxy.client.callTool({
+      name: "DELETE_OBJECT",
+      arguments: { key },
+    });
+    if (result.isError) throw new Error("DELETE_OBJECT reported an error");
+  } catch (error) {
+    console.warn("[transcribe] Failed to cleanup temporary file:", key, error);
+  }
+}
+
+/**
+ * POST /:org/transcribe
+ *
+ * Transcribe audio to text using available transcription service.
+ *
+ * Request: FormData with:
+ * - audio: Blob (audio file; required unless audioUrl is provided)
+ * - audioUrl: string (URL to audio file; used instead of audio when both are sent)
+ * - language: string (optional, ISO 639-1 language code)
+ *
+ * Response: { text, language, duration, confidence }
+ *
+ * Errors: 401 unauthenticated, 403 organization mismatch, 400 for any other
+ * rejected request, 500 when the upload or the transcription call fails.
+ */
+app.post("/:org/transcribe", async (c) => {
+  const ctx = c.get("meshContext");
+  const orgSlug = c.req.param("org");
+
+  // 1. Validate auth
+  if (!ctx.auth.user?.id && !ctx.auth.apiKey?.id) {
+    return c.json({ error: "Authentication required" }, 401);
+  }
+
+  // 2. Validate organization
+  if (!ctx.organization) {
+    return c.json({ error: "Organization context required" }, 400);
+  }
+
+  if (ctx.organization.slug !== orgSlug && ctx.organization.id !== orgSlug) {
+    return c.json({ error: "Organization mismatch" }, 403);
+  }
+
+  const organizationId = ctx.organization.id;
+
+  // 3. Parse FormData
+  let formData: FormData;
+  try {
+    formData = await c.req.formData();
+  } catch {
+    return c.json({ error: "Invalid form data" }, 400);
+  }
+
+  // FormData entries are `string | File`. Narrow them at runtime instead of
+  // asserting: a text field named "audio" would otherwise reach the `.size` /
+  // `.type` checks below as a string and throw.
+  const audioEntry = formData.get("audio");
+  const audioUrlEntry = formData.get("audioUrl");
+  const languageEntry = formData.get("language");
+  const audioFile = typeof audioEntry === "string" ? null : audioEntry;
+  const audioUrl = typeof audioUrlEntry === "string" ? audioUrlEntry : null;
+  const language = typeof languageEntry === "string" ? languageEntry : null;
+
+  if (!audioFile && !audioUrl) {
+    return c.json({ error: "Either audio file or audioUrl is required" }, 400);
+  }
+
+  // 4. Validate file size and format
+  if (audioFile) {
+    if (audioFile.size > MAX_FILE_SIZE) {
+      return c.json(
+        {
+          error: `File too large. Maximum size is ${MAX_FILE_SIZE / 1024 / 1024}MB`,
+        },
+        400,
+      );
+    }
+
+    const mimeType = audioFile.type.split(";")[0];
+    if (
+      !SUPPORTED_AUDIO_FORMATS.includes(
+        mimeType as (typeof SUPPORTED_AUDIO_FORMATS)[number],
+      )
+    ) {
+      return c.json(
+        {
+          error: `Unsupported audio format: ${mimeType}. Supported formats: ${SUPPORTED_AUDIO_FORMATS.join(", ")}`,
+        },
+        400,
+      );
+    }
+  }
+
+  // 5. Find transcription connection
+  const transcriptionConnection = await findTranscriptionConnection(
+    ctx,
+    organizationId,
+  );
+
+  if (!transcriptionConnection) {
+    return c.json(
+      {
+        error:
+          "No transcription service configured. Please add a connection with transcription capabilities (e.g., OpenAI Whisper).",
+      },
+      400,
+    );
+  }
+
+  // 6. Handle audio upload if blob provided
+  let finalAudioUrl = audioUrl;
+  let tempFileKey: string | null = null;
+  let objectStorageConnection: ConnectionEntity | null = null;
+
+  if (audioFile && !audioUrl) {
+    // Find object storage connection for temporary upload
+    objectStorageConnection = await findObjectStorageConnection(
+      ctx,
+      organizationId,
+    );
+
+    if (!objectStorageConnection) {
+      return c.json(
+        {
+          error:
+            "No object storage configured. Please add a connection with object storage capabilities (e.g., S3, R2, GCS) or provide an audioUrl instead.",
+        },
+        400,
+      );
+    }
+
+    try {
+      const uploadResult = await uploadAudioToObjectStorage(
+        ctx,
+        objectStorageConnection,
+        audioFile,
+        audioFile.type,
+      );
+      finalAudioUrl = uploadResult.url;
+      tempFileKey = uploadResult.key;
+    } catch (error) {
+      console.error("[transcribe] Upload failed:", error);
+      return c.json(
+        {
+          error: `Failed to upload audio: ${error instanceof Error ? error.message : "Unknown error"}`,
+        },
+        500,
+      );
+    }
+  }
+
+  // 7. Call transcription service
+  try {
+    const proxy = await ctx.createMCPProxy(transcriptionConnection);
+    const transcriptionClient = TranscriptionBinding.forClient(proxy);
+
+    const result = await transcriptionClient.TRANSCRIBE_AUDIO({
+      audioUrl: finalAudioUrl ?? undefined,
+      mimeType: audioFile?.type,
+      language: language ?? undefined,
+    });
+
+    // 8. Return result
+    return c.json({
+      text: result.text,
+      language: result.language,
+      duration: result.duration,
+      confidence: result.confidence,
+    });
+  } catch (error) {
+    console.error("[transcribe] Transcription failed:", error);
+
+    return c.json(
+      {
+        error: `Transcription failed: ${error instanceof Error ? error.message : "Unknown error"}`,
+      },
+      500,
+    );
+  } finally {
+    // 9. Cleanup temporary file on success and on failure alike.
+    // Don't await - cleanup runs in the background.
+    if (tempFileKey && objectStorageConnection) {
+      void deleteAudioFromObjectStorage(
+        ctx,
+        objectStorageConnection,
+        tempFileKey,
+      );
+    }
+  }
+});
+
+export default app;
diff --git a/apps/mesh/src/web/components/chat/context.tsx b/apps/mesh/src/web/components/chat/context.tsx
index d090fa08b0..539b35ca0b 100644
--- a/apps/mesh/src/web/components/chat/context.tsx
+++ b/apps/mesh/src/web/components/chat/context.tsx
@@ -25,7 +25,9 @@ import {
   type PropsWithChildren,
 } from "react";
 import { toast } from "sonner";
+import { useConnections } from "../../hooks/collections/use-connection";
 import { useModelConnections } from "../../hooks/collections/use-llm";
+import { useBindingConnections } from "../../hooks/use-binding";
 import {
   getThreadFromIndexedDB,
   useMessageActions,
@@ -122,6 +124,10 @@ interface ChatContextValue {
   clearChatError: () => void;
   finishReason: string | null;
   clearFinishReason: () => void;
+
+  // Binding availability
+  hasTranscriptionBinding: boolean;
+  hasObjectStorageBinding: boolean;
 }
 
 // ============================================================================
@@ -508,6 +514,19 @@ export function ChatProvider({ children }: PropsWithChildren) {
   const modelsConnections = useModelConnections();
   const [selectedModel, setModel] = useModelState(locator, modelsConnections);
 
+  // Binding detection for transcription feature
+  const allConnections = useConnections();
+  const transcriptionConnections = useBindingConnections({
+    connections: allConnections,
+    binding: "TRANSCRIPTION",
+  });
+  const objectStorageConnections = useBindingConnections({
+    connections: allConnections,
+    binding: "OBJECT_STORAGE",
+  });
+  const hasTranscriptionBinding = transcriptionConnections.length > 0;
+  const hasObjectStorageBinding = objectStorageConnections.length > 0;
+
   // Context prompt
   const contextPrompt = useContextHook(storedSelectedVirtualMcpId);
 
@@ -778,6 +797,10 @@ export function ChatProvider({ children }: PropsWithChildren) {
     clearChatError: chat.clearError,
     finishReason: chatState.finishReason,
     clearFinishReason,
+
+    // Binding availability
+    hasTranscriptionBinding,
+    hasObjectStorageBinding,
   };
 
   return <ChatContext.Provider value={value}>{children}</ChatContext.Provider>;
diff --git a/apps/mesh/src/web/components/chat/input.tsx b/apps/mesh/src/web/components/chat/input.tsx
index 783ba5bf6b..1a101d9bfb 100644
--- a/apps/mesh/src/web/components/chat/input.tsx
+++ b/apps/mesh/src/web/components/chat/input.tsx
@@ -9,6 +9,11 @@ import {
 } from "@deco/ui/components/popover.tsx";
 import { cn } from "@deco/ui/lib/utils.ts";
 import { useNavigate } from "@tanstack/react-router";
+import {
+  Tooltip,
+  TooltipContent,
+  TooltipTrigger,
+} from "@deco/ui/components/tooltip.tsx";
 import {
   AlertCircle,
   AlertTriangle,
@@ -17,11 +22,15 @@ import {
   CornerUpLeft,
   CpuChip02,
   Edit01,
+  Microphone01,
   Stop,
+  StopCircle,
   XCircle,
 } from "@untitledui/icons";
 import type { FormEvent } from "react";
 import { useEffect, useRef, useState, type MouseEvent } from "react";
+import { toast } from "sonner";
+import { useAudioRecorder } from "../../hooks/use-audio-recorder";
 import { useChat } from "./context";
 import { isTiptapDocEmpty } from "./tiptap/utils";
 import { ChatHighlight } from "./index";
@@ -200,10 +209,33 @@ export function ChatInput() {
     clearChatError,
     finishReason,
     clearFinishReason,
+    hasTranscriptionBinding,
+    hasObjectStorageBinding,
   } = useChat();
 
+  const { org } = useProjectContext();
+
   const tiptapRef = useRef<TiptapInputHandle | null>(null);
 
+  // Audio recording state
+  const {
+    isRecording,
+    startRecording,
+    stopRecording,
+    error: recordingError,
+    clearError: clearRecordingError,
+  } = useAudioRecorder({ maxDuration: 3 * 60 * 1000 }); // 3 minutes max
+  const [isTranscribing, setIsTranscribing] = useState(false);
+
+  // Show toast when recording error occurs
+  // oxlint-disable-next-line ban-use-effect/ban-use-effect
+  useEffect(() => {
+    if (recordingError) {
+      toast.error(recordingError.message);
+      clearRecordingError();
+    }
+  }, [recordingError, clearRecordingError]);
+
   const canSubmit =
     !isStreaming && !!selectedModel && !isTiptapDocEmpty(tiptapDoc);
 
@@ -247,6 +279,60 @@ export function ChatInput() {
     void sendMessage(doc);
   };
 
+  // Toggle the mic: start a recording, or stop it and transcribe the result.
+  const handleRecordingToggle = async () => {
+    if (isTranscribing) return;
+    if (!isRecording) {
+      await startRecording();
+      return;
+    }
+    const recording = await stopRecording();
+    if (!recording) {
+      toast.error("Falha ao gravar áudio");
+      return;
+    }
+
+    setIsTranscribing(true);
+    try {
+      const payload = new FormData();
+      payload.append("audio", recording, "recording.webm");
+
+      const response = await fetch(`/api/${org.slug}/transcribe`, {
+        method: "POST",
+        body: payload,
+        credentials: "include",
+      });
+
+      if (!response.ok) {
+        const failure = (await response.json().catch(() => ({}))) as {
+          error?: string;
+        };
+        throw new Error(failure.error || "Falha na transcrição");
+      }
+
+      const data = (await response.json()) as { text?: string };
+      if (data.text) {
+        // Hand the transcript to the input as a single-paragraph document
+        const transcriptDoc = {
+          type: "doc" as const,
+          content: [
+            {
+              type: "paragraph",
+              content: [{ type: "text", text: data.text }],
+            },
+          ],
+        };
+        setTiptapDoc(transcriptDoc);
+      }
+    } catch (err) {
+      const message =
+        err instanceof Error ? err.message : "Erro ao transcrever áudio";
+      toast.error(message);
+    } finally {
+      setIsTranscribing(false);
+    }
+  };
+
   const color = selectedVirtualMcp
     ? getGatewayColor(selectedVirtualMcp.id)
     : null;
@@ -399,6 +485,63 @@ export function ChatInput() {
                     selectedModel={selectedModel}
                     isStreaming={isStreaming}
                   />
+                  {/* Audio Recording Button - only show if transcription and object storage bindings are available */}
+                  {hasTranscriptionBinding && hasObjectStorageBinding && (
+                    <Tooltip>
+                      <TooltipTrigger asChild>
+                        <Button
+                          type="button"
+                          variant="ghost"
+                          size="icon"
+                          disabled={
+                            !selectedModel || isStreaming || isTranscribing
+                          }
+                          onClick={handleRecordingToggle}
+                          className={cn(
+                            "size-8 rounded-full transition-all relative",
+                            isRecording &&
+                              "text-destructive hover:text-destructive",
+                          )}
+                        >
+                          {isTranscribing ? (
+                            <svg
+                              className="animate-spin size-5"
+                              viewBox="0 0 24 24"
+                              fill="none"
+                            >
+                              <circle
+                                className="opacity-25"
+                                cx="12"
+                                cy="12"
+                                r="10"
+                                stroke="currentColor"
+                                strokeWidth="4"
+                              />
+                              <path
+                                className="opacity-75"
+                                fill="currentColor"
+                                d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"
+                              />
+                            </svg>
+                          ) : isRecording ? (
+                            <>
+                              <StopCircle size={20} />
+                              <span className="absolute inset-0 rounded-full animate-ping bg-destructive/20" />
+                            </>
+                          ) : (
+                            <Microphone01 size={20} />
+                          )}
+                        </Button>
+                      </TooltipTrigger>
+                      <TooltipContent side="top" sideOffset={8}>
+                        {isTranscribing
+                          ? "Transcrevendo..."
+                          : isRecording
+                            ? "Clique para parar e transcrever"
+                            : "Gravar áudio"}
+                      </TooltipContent>
+                    </Tooltip>
+                  )}
                   <Button
                     type={isStreaming ? "button" : "submit"}
                     onClick={(e: React.MouseEvent<HTMLButtonElement>) => {
diff --git a/apps/mesh/src/web/hooks/use-audio-recorder.ts b/apps/mesh/src/web/hooks/use-audio-recorder.ts
new file mode 100644
index 0000000000..8e2ce4965d
--- /dev/null
+++ b/apps/mesh/src/web/hooks/use-audio-recorder.ts
@@ -0,0 +1,272 @@
+/**
+ * Audio Recorder Hook
+ *
+ * Provides audio recording functionality using the MediaRecorder API.
+ * Handles permission requests, recording state, and blob generation.
+ */
+
+import { useRef, useState } from "react";
+
+export interface UseAudioRecorderReturn {
+  /** Whether recording is currently in progress */
+  isRecording: boolean;
+  /** Whether the recorder is initializing (getting permissions) */
+  isPending: boolean;
+  /** Start recording audio */
+  startRecording: () => Promise<void>;
+  /** Stop recording and return the audio blob */
+  stopRecording: () => Promise<Blob | null>;
+  /** Current error, if any */
+  error: Error | null;
+  /** Clear the current error */
+  clearError: () => void;
+}
+
+export interface UseAudioRecorderOptions {
+  /** Maximum recording duration in milliseconds (default: 5 minutes) */
+  maxDuration?: number;
+  /** Preferred MIME type for recording */
+  mimeType?: string;
+}
+
+/**
+ * Preferred MIME types in order of preference
+ * These are commonly supported across browsers
+ */
+const PREFERRED_MIME_TYPES = [
+  "audio/webm;codecs=opus",
+  "audio/webm",
+  "audio/mp4",
+  "audio/ogg;codecs=opus",
+];
+
+/**
+ * Pick the MIME type to record with: the caller's preference when the browser
+ * supports it, otherwise the first supported entry of PREFERRED_MIME_TYPES.
+ *
+ * @param preferredType - Optional MIME type to try before the built-in list.
+ * @returns A supported MIME type, or "" when no candidate is supported.
+ */
+function getSupportedMimeType(preferredType?: string): string {
+  // An empty or undefined preference is skipped, just like an unsupported one.
+  const candidates = preferredType
+    ? [preferredType, ...PREFERRED_MIME_TYPES]
+    : PREFERRED_MIME_TYPES;
+
+  const supported = candidates.find((candidate) =>
+    MediaRecorder.isTypeSupported(candidate),
+  );
+
+  return supported ?? "";
+}
+
+/**
+ * Hook for recording audio using the MediaRecorder API
+ *
+ * @param options - Recording options
+ * @returns Recording state and control functions
+ *
+ * @example
+ * ```tsx
+ * const { isRecording, startRecording, stopRecording, error } = useAudioRecorder();
+ *
+ * const handleToggle = async () => {
+ *   if (isRecording) {
+ *     const blob = await stopRecording();
+ *     if (blob) {
+ *       // Do something with the audio blob
+ *     }
+ *   } else {
+ *     await startRecording();
+ *   }
+ * };
+ * ```
+ */
+export function useAudioRecorder(
+  options: UseAudioRecorderOptions = {},
+): UseAudioRecorderReturn {
+  const { maxDuration = 5 * 60 * 1000, mimeType: preferredMimeType } = options;
+
+  const [isRecording, setIsRecording] = useState(false);
+  const [isPending, setIsPending] = useState(false);
+  const [error, setError] = useState<Error | null>(null);
+
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const mediaStreamRef = useRef<MediaStream | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+  const timeoutRef = useRef<number | null>(null);
+  const resolveStopRef = useRef<((blob: Blob | null) => void) | null>(null);
+
+  const clearError = () => setError(null);
+
+  /**
+   * Release the timer, microphone tracks, recorder reference and buffered chunks.
+   */
+  const cleanup = () => {
+    const pendingTimeout = timeoutRef.current;
+    if (pendingTimeout !== null) {
+      clearTimeout(pendingTimeout);
+      timeoutRef.current = null;
+    }
+
+    const stream = mediaStreamRef.current;
+    if (stream) {
+      for (const track of stream.getTracks()) {
+        track.stop();
+      }
+      mediaStreamRef.current = null;
+    }
+    mediaRecorderRef.current = null;
+    chunksRef.current = [];
+  };
+
+  /**
+   * Request microphone access and start recording; failures are stored in `error` state.
+   */
+  const startRecording = async (): Promise<void> => {
+    // Bail out early when the browser lacks getUserMedia or MediaRecorder
+    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
+      setError(new Error("Audio recording is not supported in this browser"));
+      return;
+    }
+
+    if (!window.MediaRecorder) {
+      setError(new Error("MediaRecorder API is not supported in this browser"));
+      return;
+    }
+
+    setIsPending(true);
+    setError(null);
+
+    try {
+      // Request microphone permission
+      const stream = await navigator.mediaDevices.getUserMedia({
+        audio: {
+          echoCancellation: true,
+          noiseSuppression: true,
+          sampleRate: 44100,
+        },
+      });
+
+      mediaStreamRef.current = stream;
+
+      // Use the caller's preferred MIME type when supported, else a built-in fallback
+      const mimeType = getSupportedMimeType(preferredMimeType);
+
+      // Create MediaRecorder; an empty MIME type is passed as undefined
+      const recorder = new MediaRecorder(stream, {
+        mimeType: mimeType || undefined,
+      });
+
+      mediaRecorderRef.current = recorder;
+      chunksRef.current = [];
+
+      // Buffer every non-empty chunk the recorder emits
+      recorder.ondataavailable = (event) => {
+        if (event.data.size > 0) {
+          chunksRef.current.push(event.data);
+        }
+      };
+
+      // On stop: assemble the chunks into one Blob, deliver it, release resources
+      recorder.onstop = () => {
+        const blob = new Blob(chunksRef.current, {
+          type: recorder.mimeType || "audio/webm",
+        });
+
+        // Only a pending stopRecording() call receives the blob; otherwise cleanup() drops it
+        if (resolveStopRef.current) {
+          resolveStopRef.current(blob);
+          resolveStopRef.current = null;
+        }
+
+        cleanup();
+        setIsRecording(false);
+      };
+
+      // On recorder error: surface it, resolve a pending stopRecording() with null, release resources
+      recorder.onerror = (event) => {
+        console.error("[useAudioRecorder] Recording error:", event);
+        setError(new Error("Recording failed"));
+
+        if (resolveStopRef.current) {
+          resolveStopRef.current(null);
+          resolveStopRef.current = null;
+        }
+
+        cleanup();
+        setIsRecording(false);
+      };
+
+      // Start recording
+      recorder.start(1000); // Collect data every second
+      setIsRecording(true);
+      setIsPending(false);
+
+      // Auto-stop after maxDuration ms; this goes through onstop, where no stopRecording() is pending
+      timeoutRef.current = window.setTimeout(() => {
+        if (mediaRecorderRef.current?.state === "recording") {
+          mediaRecorderRef.current.stop();
+        }
+      }, maxDuration);
+    } catch (err) {
+      cleanup();
+      setIsPending(false);
+
+      if (err instanceof DOMException) {
+        if (err.name === "NotAllowedError") {
+          setError(
+            new Error(
+              "Microphone access denied. Please allow microphone access and try again.",
+            ),
+          );
+        } else if (err.name === "NotFoundError") {
+          setError(
+            new Error(
+              "No microphone found. Please connect a microphone and try again.",
+            ),
+          );
+        } else {
+          setError(new Error(`Failed to access microphone: ${err.message}`));
+        }
+      } else {
+        setError(
+          err instanceof Error ? err : new Error("Failed to start recording"),
+        );
+      }
+    }
+  };
+
+  /**
+   * Stop recording and return the audio blob.
+   *
+   * Resolves with the recorded blob once the recorder's `onstop` handler runs,
+   * or with null when no recording is in progress.
+   */
+  const stopRecording = (): Promise<Blob | null> => {
+    return new Promise((resolve) => {
+      // Trust the recorder's own state rather than the `isRecording` React
+      // state, which can be stale inside the closure that calls this function.
+      const recorder = mediaRecorderRef.current;
+      if (!recorder || recorder.state !== "recording") {
+        // Leave resolveStopRef alone: it may belong to an earlier call that is
+        // still waiting for onstop, and clearing it would hang that promise.
+        resolve(null);
+        return;
+      }
+
+      // Store resolve function to be called in onstop handler
+      resolveStopRef.current = resolve;
+      recorder.stop();
+    });
+  };
+
+  return {
+    isRecording,
+    isPending,
+    startRecording,
+    stopRecording,
+    error,
+    clearError,
+  };
+}
diff --git a/apps/mesh/src/web/hooks/use-binding.ts b/apps/mesh/src/web/hooks/use-binding.ts
index b332839871..60a8bcd1e3 100644
--- a/apps/mesh/src/web/hooks/use-binding.ts
+++ b/apps/mesh/src/web/hooks/use-binding.ts
@@ -1,5 +1,10 @@
 import { z } from "zod";
-import { type Binder, createBindingChecker } from "@decocms/bindings";
+import {
+  type Binder,
+  createBindingChecker,
+  OBJECT_STORAGE_BINDING,
+  TRANSCRIPTION_BINDING,
+} from "@decocms/bindings";
 import {
   BaseCollectionEntitySchema,
   createCollectionBindings,
@@ -23,6 +28,8 @@ const BUILTIN_BINDINGS: Record<string, Binder> = {
   WORKFLOW_EXECUTION: WORKFLOW_EXECUTION_BINDING,
   ASSISTANTS: ASSISTANTS_BINDING,
   MCP: MCP_BINDING,
+  TRANSCRIPTION: TRANSCRIPTION_BINDING,
+  OBJECT_STORAGE: OBJECT_STORAGE_BINDING,
 };
 
 /**
diff --git a/packages/bindings/src/index.ts b/packages/bindings/src/index.ts
index 3ccaa3fc64..860530a1e3 100644
--- a/packages/bindings/src/index.ts
+++ b/packages/bindings/src/index.ts
@@ -108,3 +108,19 @@ export {
   type DeleteObjectsInput,
   type DeleteObjectsOutput,
 } from "./well-known/object-storage";
+
+// Re-export transcription binding types
+export {
+  TRANSCRIPTION_BINDING,
+  TranscriptionBinding,
+  TranscriptionInputSchema,
+  type TranscriptionInput,
+  TranscriptionOutputSchema,
+  type TranscriptionOutput,
+  TranscriptionWordSchema,
+  type TranscriptionWord,
+  TranscriptionSegmentSchema,
+  type TranscriptionSegment,
+  SUPPORTED_AUDIO_FORMATS,
+  type TranscriptionBindingType,
+} from "./well-known/transcription";
diff --git a/packages/bindings/src/well-known/transcription.ts b/packages/bindings/src/well-known/transcription.ts
new file mode 100644
index 0000000000..6e0ada637a
--- /dev/null
+++ b/packages/bindings/src/well-known/transcription.ts
@@ -0,0 +1,159 @@
+/**
+ * Transcription Well-Known Binding
+ *
+ * Defines the interface for audio transcription operations.
+ * Any MCP that implements this binding can provide audio-to-text
+ * transcription capabilities (e.g., OpenAI Whisper, Google Speech-to-Text).
+ *
+ * This binding includes:
+ * - TRANSCRIBE_AUDIO: Transcribe audio to text
+ */
+
+import { z } from "zod";
+import { bindingClient, type ToolBinder } from "../core/binder";
+
+/**
+ * Supported audio formats for transcription
+ */
+export const SUPPORTED_AUDIO_FORMATS = [
+  "audio/webm",
+  "audio/mp3",
+  "audio/mpeg",
+  "audio/mp4",
+  "audio/m4a",
+  "audio/wav",
+  "audio/ogg",
+  "audio/flac",
+  "video/webm",
+] as const;
+
+// ============================================================================
+// Tool Schemas
+// ============================================================================
+
+/**
+ * TRANSCRIBE_AUDIO Input Schema
+ */
+export const TranscriptionInputSchema = z.object({
+  audio: z.string().optional().describe("Base64-encoded audio data"),
+  audioUrl: z
+    .string()
+    .url()
+    .optional()
+    .describe("URL pointing to the audio file"),
+  mimeType: z
+    .string()
+    .optional()
+    .describe("MIME type of the audio file (e.g., audio/webm, audio/mp3)"),
+  language: z
+    .string()
+    .optional()
+    .describe(
+      "Language hint for transcription (ISO 639-1 code, e.g., en, pt, es)",
+    ),
+  prompt: z
+    .string()
+    .optional()
+    .describe("Optional prompt to guide the transcription with context"),
+  includeTimestamps: z
+    .boolean()
+    .optional()
+    .describe("Whether to include word-level timestamps"),
+  includeSpeakerLabels: z
+    .boolean()
+    .optional()
+    .describe("Whether to identify and label different speakers"),
+});
+
+export type TranscriptionInput = z.infer<typeof TranscriptionInputSchema>;
+
+/**
+ * Word-level transcription detail
+ */
+export const TranscriptionWordSchema = z.object({
+  word: z.string().describe("The transcribed word"),
+  start: z.number().optional().describe("Start time in seconds"),
+  end: z.number().optional().describe("End time in seconds"),
+  confidence: z.number().optional().describe("Confidence score (0-1)"),
+  speaker: z
+    .string()
+    .optional()
+    .describe("Speaker label if diarization enabled"),
+});
+
+export type TranscriptionWord = z.infer<typeof TranscriptionWordSchema>;
+
+/**
+ * Segment-level transcription detail
+ */
+export const TranscriptionSegmentSchema = z.object({
+  text: z.string().describe("Transcribed text for this segment"),
+  start: z.number().optional().describe("Start time in seconds"),
+  end: z.number().optional().describe("End time in seconds"),
+  speaker: z
+    .string()
+    .optional()
+    .describe("Speaker label if diarization enabled"),
+  words: z
+    .array(TranscriptionWordSchema)
+    .optional()
+    .describe("Word-level details"),
+});
+
+export type TranscriptionSegment = z.infer<typeof TranscriptionSegmentSchema>;
+
+/**
+ * TRANSCRIBE_AUDIO Output Schema
+ */
+export const TranscriptionOutputSchema = z.object({
+  text: z.string().describe("The full transcribed text"),
+  language: z
+    .string()
+    .optional()
+    .describe("Detected or confirmed language (ISO 639-1 code)"),
+  duration: z.number().optional().describe("Duration of the audio in seconds"),
+  segments: z
+    .array(TranscriptionSegmentSchema)
+    .optional()
+    .describe("Segments with timestamps and optional speaker labels"),
+  confidence: z.number().optional().describe("Overall confidence score (0-1)"),
+  providerMetadata: z
+    .any()
+    .optional()
+    .describe("Additional provider-specific metadata"),
+});
+
+export type TranscriptionOutput = z.infer<typeof TranscriptionOutputSchema>;
+
+// ============================================================================
+// Binding Definition
+// ============================================================================
+
+/**
+ * Transcription Binding
+ *
+ * Defines the interface for audio transcription operations.
+ * Any MCP that implements this binding can be used for audio-to-text
+ * transcription in the chat interface.
+ *
+ * Required tools:
+ * - TRANSCRIBE_AUDIO: Transcribe audio to text
+ */
+export const TRANSCRIPTION_BINDING = [
+  {
+    name: "TRANSCRIBE_AUDIO" as const,
+    inputSchema: TranscriptionInputSchema,
+    outputSchema: TranscriptionOutputSchema,
+  } satisfies ToolBinder<
+    "TRANSCRIBE_AUDIO",
+    TranscriptionInput,
+    TranscriptionOutput
+  >,
+] as const;
+
+export type TranscriptionBindingType = typeof TRANSCRIPTION_BINDING;
+
+/**
+ * Transcription binding client for calling TRANSCRIBE_AUDIO
+ */
+export const TranscriptionBinding = bindingClient(TRANSCRIPTION_BINDING);

From f5f9348e7688c8b2c5fe84ec905b54eb3b978c7f Mon Sep 17 00:00:00 2001
From: viniciusventura29 <viniciusventura29@gmail.com>
Date: Thu, 22 Jan 2026 10:26:50 -0300
Subject: [PATCH 02/11] refactor: streamline connection binding functions and
 improve error messages

- Renamed and consolidated functions for finding connections with specific bindings to enhance code clarity and reusability.
- Translated the ChatInput error toasts for audio recording and transcription failures from Portuguese to English.
- Translated the record-button tooltip text (idle, recording, and transcribing states) from Portuguese to English.
- Reformatted the prepareSendMessagesRequest parameter list in the chat context (formatting only, no behavior change).
---
 apps/mesh/src/api/routes/transcribe.ts        | 32 ++++++-------------
 apps/mesh/src/web/components/chat/context.tsx |  8 ++++-
 apps/mesh/src/web/components/chat/input.tsx   | 12 +++----
 3 files changed, 22 insertions(+), 30 deletions(-)

diff --git a/apps/mesh/src/api/routes/transcribe.ts b/apps/mesh/src/api/routes/transcribe.ts
index 4f97eeb3b4..92b83059bb 100644
--- a/apps/mesh/src/api/routes/transcribe.ts
+++ b/apps/mesh/src/api/routes/transcribe.ts
@@ -14,6 +14,7 @@ import {
   OBJECT_STORAGE_BINDING,
   SUPPORTED_AUDIO_FORMATS,
   connectionImplementsBinding,
+  type Binder,
 } from "@decocms/bindings";
 import { Hono } from "hono";
 import type { MeshContext } from "../../core/mesh-context";
@@ -28,35 +29,18 @@ const app = new Hono<{ Variables: Variables }>();
 const MAX_FILE_SIZE = 25 * 1024 * 1024; // 25MB
 
 /**
- * Find a connection that implements TRANSCRIPTION_BINDING
+ * Find a connection that implements a specific binding
  */
-async function findTranscriptionConnection(
+async function findConnectionWithBinding(
   ctx: MeshContext,
   organizationId: string,
+  binding: Binder,
 ): Promise<ConnectionEntity | null> {
   const connections = await ctx.storage.connections.list(organizationId);
   return (
     connections.find(
       (conn) =>
-        conn.status === "active" &&
-        connectionImplementsBinding(conn, TRANSCRIPTION_BINDING),
-    ) ?? null
-  );
-}
-
-/**
- * Find a connection that implements OBJECT_STORAGE_BINDING
- */
-async function findObjectStorageConnection(
-  ctx: MeshContext,
-  organizationId: string,
-): Promise<ConnectionEntity | null> {
-  const connections = await ctx.storage.connections.list(organizationId);
-  return (
-    connections.find(
-      (conn) =>
-        conn.status === "active" &&
-        connectionImplementsBinding(conn, OBJECT_STORAGE_BINDING),
+        conn.status === "active" && connectionImplementsBinding(conn, binding),
     ) ?? null
   );
 }
@@ -251,9 +235,10 @@ app.post("/:org/transcribe", async (c) => {
   }
 
   // 5. Find transcription connection
-  const transcriptionConnection = await findTranscriptionConnection(
+  const transcriptionConnection = await findConnectionWithBinding(
     ctx,
     organizationId,
+    TRANSCRIPTION_BINDING,
   );
 
   if (!transcriptionConnection) {
@@ -273,9 +258,10 @@ app.post("/:org/transcribe", async (c) => {
 
   if (audioFile && !audioUrl) {
     // Find object storage connection for temporary upload
-    objectStorageConnection = await findObjectStorageConnection(
+    objectStorageConnection = await findConnectionWithBinding(
       ctx,
       organizationId,
+      OBJECT_STORAGE_BINDING,
     );
 
     if (!objectStorageConnection) {
diff --git a/apps/mesh/src/web/components/chat/context.tsx b/apps/mesh/src/web/components/chat/context.tsx
index 621d8c303a..877445401b 100644
--- a/apps/mesh/src/web/components/chat/context.tsx
+++ b/apps/mesh/src/web/components/chat/context.tsx
@@ -138,7 +138,13 @@ const createModelsTransport = (
   new DefaultChatTransport<UIMessage<Metadata>>({
     api: `/api/${org}/decopilot/stream`,
     credentials: "include",
-    prepareSendMessagesRequest: ({ messages, requestMetadata = {} }: { messages: Message[]; requestMetadata: Metadata }) => {
+    prepareSendMessagesRequest: ({
+      messages,
+      requestMetadata = {},
+    }: {
+      messages: Message[];
+      requestMetadata: Metadata;
+    }) => {
       const {
         system,
         tiptapDoc: _tiptapDoc,
diff --git a/apps/mesh/src/web/components/chat/input.tsx b/apps/mesh/src/web/components/chat/input.tsx
index f6ba35791a..34e0d5951e 100644
--- a/apps/mesh/src/web/components/chat/input.tsx
+++ b/apps/mesh/src/web/components/chat/input.tsx
@@ -274,7 +274,7 @@ export function ChatInput() {
     if (isRecording) {
       const audioBlob = await stopRecording();
       if (!audioBlob) {
-        toast.error("Falha ao gravar áudio");
+        toast.error("Failed to record audio");
         return;
       }
 
@@ -292,7 +292,7 @@ export function ChatInput() {
         if (!response.ok) {
           const errorData = await response.json().catch(() => ({}));
           throw new Error(
-            (errorData as { error?: string }).error || "Falha na transcrição",
+            (errorData as { error?: string }).error || "Transcription failed",
           );
         }
 
@@ -312,7 +312,7 @@ export function ChatInput() {
         }
       } catch (err) {
         toast.error(
-          err instanceof Error ? err.message : "Erro ao transcrever áudio",
+          err instanceof Error ? err.message : "Failed to transcribe audio",
         );
       } finally {
         setIsTranscribing(false);
@@ -503,10 +503,10 @@ export function ChatInput() {
                       </TooltipTrigger>
                       <TooltipContent side="top" sideOffset={8}>
                         {isTranscribing
-                          ? "Transcrevendo..."
+                          ? "Transcribing..."
                           : isRecording
-                            ? "Clique para parar e transcrever"
-                            : "Gravar áudio"}
+                            ? "Click to stop and transcribe"
+                            : "Record audio"}
                       </TooltipContent>
                     </Tooltip>
                   )}

From a540fec5eec72082a6eb12821691fe54174a64e6 Mon Sep 17 00:00:00 2001
From: viniciusventura29 <viniciusventura29@gmail.com>
Date: Thu, 22 Jan 2026 10:32:45 -0300
Subject: [PATCH 03/11] feat: implement audio URL validation to prevent SSRF
 attacks

- Added a new function to validate audio URLs, ensuring only HTTP/HTTPS URLs with public hosts are accepted.
- Updated the transcription API route to validate the audioUrl parameter before processing.
- Enhanced the TranscriptionInputSchema to enforce the requirement of either 'audio' or 'audioUrl' for transcription requests.
- Improved the audio recorder hook to check the actual state of the media recorder before stopping it.
---
 apps/mesh/src/api/routes/transcribe.ts        | 66 ++++++++++++++++++-
 apps/mesh/src/web/hooks/use-audio-recorder.ts | 14 ++--
 .../bindings/src/well-known/transcription.ts  | 64 +++++++++---------
 3 files changed, 105 insertions(+), 39 deletions(-)

diff --git a/apps/mesh/src/api/routes/transcribe.ts b/apps/mesh/src/api/routes/transcribe.ts
index 92b83059bb..536f25a224 100644
--- a/apps/mesh/src/api/routes/transcribe.ts
+++ b/apps/mesh/src/api/routes/transcribe.ts
@@ -28,6 +28,62 @@ const app = new Hono<{ Variables: Variables }>();
 
 const MAX_FILE_SIZE = 25 * 1024 * 1024; // 25MB
 
+/**
+ * Validate audioUrl to prevent SSRF attacks
+ * Only allows HTTP/HTTPS URLs with public hosts
+ */
+function validateAudioUrl(
+  urlString: string,
+): { valid: true } | { valid: false; error: string } {
+  let url: URL;
+  try {
+    url = new URL(urlString);
+  } catch {
+    return { valid: false, error: "Invalid URL format" };
+  }
+
+  // Only allow HTTP/HTTPS schemes
+  if (url.protocol !== "http:" && url.protocol !== "https:") {
+    return { valid: false, error: "Only HTTP and HTTPS URLs are allowed" };
+  }
+
+  const hostname = url.hostname.toLowerCase();
+
+  // Block localhost and loopback addresses
+  if (
+    hostname === "localhost" ||
+    hostname === "127.0.0.1" ||
+    hostname === "[::1]" ||
+    hostname === "::1"
+  ) {
+    return { valid: false, error: "Localhost URLs are not allowed" };
+  }
+
+  // Block private IP ranges (basic check)
+  // 10.x.x.x, 172.16-31.x.x, 192.168.x.x, 169.254.x.x (link-local/AWS metadata)
+  const ipv4Match = hostname.match(
+    /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/,
+  );
+  if (ipv4Match) {
+    const [, a, b] = ipv4Match.map(Number);
+    if (
+      a === 10 ||
+      a === 127 ||
+      (a === 172 && b >= 16 && b <= 31) ||
+      (a === 192 && b === 168) ||
+      (a === 169 && b === 254) ||
+      a === 0
+    ) {
+      return {
+        valid: false,
+        error: "Private or internal IP addresses are not allowed",
+      };
+    }
+  }
+
+  return { valid: true };
+}
+
 /**
  * Find a connection that implements a specific binding
  */
@@ -208,7 +264,15 @@ app.post("/:org/transcribe", async (c) => {
     return c.json({ error: "Either audio file or audioUrl is required" }, 400);
   }
 
-  // 4. Validate file size and format
+  // 4. Validate audioUrl if provided (prevent SSRF)
+  if (audioUrl) {
+    const urlValidation = validateAudioUrl(audioUrl);
+    if (!urlValidation.valid) {
+      return c.json({ error: urlValidation.error }, 400);
+    }
+  }
+
+  // 5. Validate file size and format (if file provided)
   if (audioFile) {
     if (audioFile.size > MAX_FILE_SIZE) {
       return c.json(
diff --git a/apps/mesh/src/web/hooks/use-audio-recorder.ts b/apps/mesh/src/web/hooks/use-audio-recorder.ts
index 8e2ce4965d..8f7dd0ab70 100644
--- a/apps/mesh/src/web/hooks/use-audio-recorder.ts
+++ b/apps/mesh/src/web/hooks/use-audio-recorder.ts
@@ -242,7 +242,11 @@ export function useAudioRecorder(
    */
   const stopRecording = (): Promise<Blob | null> => {
     return new Promise((resolve) => {
-      if (!mediaRecorderRef.current || !isRecording) {
+      // Check recorder's actual state instead of React state to avoid stale closures
+      if (
+        !mediaRecorderRef.current ||
+        mediaRecorderRef.current.state !== "recording"
+      ) {
         resolve(null);
         return;
       }
@@ -251,13 +255,7 @@ export function useAudioRecorder(
       resolveStopRef.current = resolve;
 
       // Stop the recorder
-      if (mediaRecorderRef.current.state === "recording") {
-        mediaRecorderRef.current.stop();
-      } else {
-        // If not recording, resolve immediately
-        resolve(null);
-        resolveStopRef.current = null;
-      }
+      mediaRecorderRef.current.stop();
     });
   };
 
diff --git a/packages/bindings/src/well-known/transcription.ts b/packages/bindings/src/well-known/transcription.ts
index 6e0ada637a..015ea05913 100644
--- a/packages/bindings/src/well-known/transcription.ts
+++ b/packages/bindings/src/well-known/transcription.ts
@@ -34,36 +34,40 @@ export const SUPPORTED_AUDIO_FORMATS = [
 /**
  * TRANSCRIBE_AUDIO Input Schema
  */
-export const TranscriptionInputSchema = z.object({
-  audio: z.string().optional().describe("Base64-encoded audio data"),
-  audioUrl: z
-    .string()
-    .url()
-    .optional()
-    .describe("URL pointing to the audio file"),
-  mimeType: z
-    .string()
-    .optional()
-    .describe("MIME type of the audio file (e.g., audio/webm, audio/mp3)"),
-  language: z
-    .string()
-    .optional()
-    .describe(
-      "Language hint for transcription (ISO 639-1 code, e.g., en, pt, es)",
-    ),
-  prompt: z
-    .string()
-    .optional()
-    .describe("Optional prompt to guide the transcription with context"),
-  includeTimestamps: z
-    .boolean()
-    .optional()
-    .describe("Whether to include word-level timestamps"),
-  includeSpeakerLabels: z
-    .boolean()
-    .optional()
-    .describe("Whether to identify and label different speakers"),
-});
+export const TranscriptionInputSchema = z
+  .object({
+    audio: z.string().optional().describe("Base64-encoded audio data"),
+    audioUrl: z
+      .string()
+      .url()
+      .optional()
+      .describe("URL pointing to the audio file"),
+    mimeType: z
+      .string()
+      .optional()
+      .describe("MIME type of the audio file (e.g., audio/webm, audio/mp3)"),
+    language: z
+      .string()
+      .optional()
+      .describe(
+        "Language hint for transcription (ISO 639-1 code, e.g., en, pt, es)",
+      ),
+    prompt: z
+      .string()
+      .optional()
+      .describe("Optional prompt to guide the transcription with context"),
+    includeTimestamps: z
+      .boolean()
+      .optional()
+      .describe("Whether to include word-level timestamps"),
+    includeSpeakerLabels: z
+      .boolean()
+      .optional()
+      .describe("Whether to identify and label different speakers"),
+  })
+  .refine((data) => data.audio !== undefined || data.audioUrl !== undefined, {
+    message: "Either 'audio' or 'audioUrl' must be provided",
+  });
 
 export type TranscriptionInput = z.infer<typeof TranscriptionInputSchema>;
 

From 6a55353af8c1d9156c77d0df2fefec30b690cdc3 Mon Sep 17 00:00:00 2001
From: viniciusventura29 <viniciusventura29@gmail.com>
Date: Thu, 22 Jan 2026 11:57:18 -0300
Subject: [PATCH 04/11] feat: enhance audio URL validation to prevent SSRF
 attacks

- Added a function to check if an IP address is private, improving the validation of audio URLs.
- Updated the validateAudioUrl function to resolve DNS and ensure that URLs do not resolve to private or internal IP addresses.
- Modified the transcription API route to await the validation of audioUrl, ensuring proper error handling for invalid URLs.
---
 apps/mesh/src/api/routes/transcribe.ts | 89 +++++++++++++++++---------
 1 file changed, 59 insertions(+), 30 deletions(-)

diff --git a/apps/mesh/src/api/routes/transcribe.ts b/apps/mesh/src/api/routes/transcribe.ts
index 536f25a224..0bc0995846 100644
--- a/apps/mesh/src/api/routes/transcribe.ts
+++ b/apps/mesh/src/api/routes/transcribe.ts
@@ -17,6 +17,7 @@ import {
   type Binder,
 } from "@decocms/bindings";
 import { Hono } from "hono";
+import { lookup } from "node:dns/promises";
 import type { MeshContext } from "../../core/mesh-context";
 import type { ConnectionEntity } from "../../tools/connection/schema";
 
@@ -28,13 +29,45 @@ const app = new Hono<{ Variables: Variables }>();
 
 const MAX_FILE_SIZE = 25 * 1024 * 1024; // 25MB
 
+/**
+ * Check if an IP address is private/internal
+ */
+function isPrivateIp(ip: string): boolean {
+  // IPv4 check
+  const ipv4Match = ip.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
+  if (ipv4Match) {
+    const [, a, b] = ipv4Match.map(Number);
+    return (
+      a === 10 || // 10.0.0.0/8
+      a === 127 || // 127.0.0.0/8 (loopback)
+      (a === 172 && b && b >= 16 && b <= 31) || // 172.16.0.0/12
+      (a === 192 && b === 168) || // 192.168.0.0/16
+      (a === 169 && b === 254) || // 169.254.0.0/16 (link-local, AWS metadata)
+      a === 0 // 0.0.0.0/8
+    );
+  }
+
+  // IPv6 check
+  const ipLower = ip.toLowerCase();
+  if (
+    ipLower === "::1" || // loopback
+    ipLower.startsWith("fe80:") || // link-local
+    ipLower.startsWith("fc") || // unique local (fc00::/7)
+    ipLower.startsWith("fd") // unique local (fc00::/7)
+  ) {
+    return true;
+  }
+
+  return false;
+}
+
 /**
  * Validate audioUrl to prevent SSRF attacks
- * Only allows HTTP/HTTPS URLs with public hosts
+ * Checks URL format, scheme, and resolves DNS to verify IPs are public
  */
-function validateAudioUrl(
+async function validateAudioUrl(
   urlString: string,
-): { valid: true } | { valid: false; error: string } {
+): Promise<{ valid: true } | { valid: false; error: string }> {
   let url: URL;
   try {
     url = new URL(urlString);
@@ -49,36 +82,32 @@ function validateAudioUrl(
 
   const hostname = url.hostname.toLowerCase();
 
-  // Block localhost and loopback addresses
-  if (
-    hostname === "localhost" ||
-    hostname === "127.0.0.1" ||
-    hostname === "[::1]" ||
-    hostname === "::1"
-  ) {
+  // Block localhost and loopback addresses (string check)
+  if (hostname === "localhost" || hostname === "[::1]") {
     return { valid: false, error: "Localhost URLs are not allowed" };
   }
 
-  // Block private IP ranges (basic check)
-  // 10.x.x.x, 172.16-31.x.x, 192.168.x.x, 169.254.x.x (link-local/AWS metadata)
-  const ipv4Match = hostname.match(
-    /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/,
-  );
-  if (ipv4Match) {
-    const [, a, b] = ipv4Match.map(Number);
-    if (
-      a === 10 ||
-      a === 127 ||
-      (a === 172 && b >= 16 && b <= 31) ||
-      (a === 192 && b === 168) ||
-      (a === 169 && b === 254) ||
-      a === 0
-    ) {
-      return {
-        valid: false,
-        error: "Private or internal IP addresses are not allowed",
-      };
+  // If hostname is already an IP, check it directly
+  if (isPrivateIp(hostname)) {
+    return {
+      valid: false,
+      error: "Private or internal IP addresses are not allowed",
+    };
+  }
+
+  // Resolve DNS and check all returned IPs to prevent DNS rebinding
+  try {
+    const results = await lookup(hostname, { all: true });
+    for (const { address } of results) {
+      if (isPrivateIp(address)) {
+        return {
+          valid: false,
+          error: "URL resolves to a private or internal IP address",
+        };
+      }
     }
+  } catch {
+    return { valid: false, error: "Failed to resolve hostname" };
   }
 
   return { valid: true };
@@ -266,7 +295,7 @@ app.post("/:org/transcribe", async (c) => {
 
   // 4. Validate audioUrl if provided (prevent SSRF)
   if (audioUrl) {
-    const urlValidation = validateAudioUrl(audioUrl);
+    const urlValidation = await validateAudioUrl(audioUrl);
     if (!urlValidation.valid) {
       return c.json({ error: urlValidation.error }, 400);
     }

From b9bbe4da55cc3010218a51e790dba029780b0e89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vin=C3=ADcius=20Ventura?= <viniciusventura29@gmail.com>
Date: Thu, 22 Jan 2026 12:05:59 -0300
Subject: [PATCH 05/11] Update apps/mesh/src/api/routes/transcribe.ts

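Treat IPv4-mapped IPv6 addresses (::ffff:a.b.c.d) as private in isPrivateIp
when the embedded IPv4 address is private.

Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>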
---
 apps/mesh/src/api/routes/transcribe.ts | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/apps/mesh/src/api/routes/transcribe.ts b/apps/mesh/src/api/routes/transcribe.ts
index 0bc0995846..e01098138a 100644
--- a/apps/mesh/src/api/routes/transcribe.ts
+++ b/apps/mesh/src/api/routes/transcribe.ts
@@ -49,6 +49,12 @@ function isPrivateIp(ip: string): boolean {
 
   // IPv6 check
   const ipLower = ip.toLowerCase();
+  const ipv4MappedMatch = ipLower.match(
+    /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/,
+  );
+  if (ipv4MappedMatch && isPrivateIp(ipv4MappedMatch[1])) {
+    return true;
+  }
   if (
     ipLower === "::1" || // loopback
     ipLower.startsWith("fe80:") || // link-local

From 74f27a9e2a5453a685de20a0eed0cf0dfec34762 Mon Sep 17 00:00:00 2001
From: viniciusventura29 <viniciusventura29@gmail.com>
Date: Thu, 22 Jan 2026 15:34:33 -0300
Subject: [PATCH 06/11] fix: handle potential undefined value in IP address
 validation

- Updated the isPrivateIp function to safely handle undefined values when checking IPv4-mapped addresses, ensuring robust validation of IP addresses.
---
 apps/mesh/src/api/routes/transcribe.ts | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/apps/mesh/src/api/routes/transcribe.ts b/apps/mesh/src/api/routes/transcribe.ts
index e01098138a..58b78f2a39 100644
--- a/apps/mesh/src/api/routes/transcribe.ts
+++ b/apps/mesh/src/api/routes/transcribe.ts
@@ -49,10 +49,8 @@ function isPrivateIp(ip: string): boolean {
 
   // IPv6 check
   const ipLower = ip.toLowerCase();
-  const ipv4MappedMatch = ipLower.match(
-    /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/,
-  );
-  if (ipv4MappedMatch && isPrivateIp(ipv4MappedMatch[1])) {
+  const ipv4MappedMatch = ipLower.match(/^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/);
+  if (ipv4MappedMatch && isPrivateIp(ipv4MappedMatch[1] ?? "")) {
     return true;
   }
   if (

From 9829d2cc199eeee7ce84c383cdc3731689c5e425 Mon Sep 17 00:00:00 2001
From: viniciusventura29 <viniciusventura29@gmail.com>
Date: Fri, 23 Jan 2026 17:19:19 -0300
Subject: [PATCH 07/11] Refactor chat context to use new connections hook and
 simplify message preparation logic

---
 apps/mesh/src/web/components/chat/context.tsx | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/apps/mesh/src/web/components/chat/context.tsx b/apps/mesh/src/web/components/chat/context.tsx
index 3cbf9e0a4e..af67edba49 100644
--- a/apps/mesh/src/web/components/chat/context.tsx
+++ b/apps/mesh/src/web/components/chat/context.tsx
@@ -14,6 +14,7 @@ import {
   useProjectContext,
   useVirtualMCPs,
   SELF_MCP_ALIAS_ID,
+  useConnections,
 } from "@decocms/mesh-sdk";
 import type { Client } from "@modelcontextprotocol/sdk/client/index.js";
 import type {
@@ -35,7 +36,6 @@ import {
   useReducer,
 } from "react";
 import { toast } from "sonner";
-import { useConnections } from "../../hooks/collections/use-connection";
 import { useBindingConnections } from "../../hooks/use-binding";
 import { useModelConnections } from "../../hooks/collections/use-llm";
 import { useThreadMessages } from "../../hooks/use-chat-store";
@@ -141,13 +141,7 @@ const createModelsTransport = (
   new DefaultChatTransport<UIMessage<Metadata>>({
     api: `/api/${org}/decopilot/stream`,
     credentials: "include",
-    prepareSendMessagesRequest: ({
-      messages,
-      requestMetadata = {},
-    }: {
-      messages: Message[];
-      requestMetadata: Metadata;
-    }) => {
+    prepareSendMessagesRequest: ({ messages, requestMetadata = {} }) => {
       const {
         system,
         tiptapDoc: _tiptapDoc,

From 71f22910e5e1b75ff36f57234a9a6f7e4f929323 Mon Sep 17 00:00:00 2001
From: viniciusventura29 <viniciusventura29@gmail.com>
Date: Fri, 23 Jan 2026 19:31:11 -0300
Subject: [PATCH 08/11] Refactor audio processing in transcription to convert
 Blob to data URL, removing object storage dependency. Update chat context to
 eliminate object storage binding checks.

---
 apps/mesh/src/api/routes/transcribe.ts        | 174 ++----------------
 apps/mesh/src/web/components/chat/context.tsx |   7 -
 apps/mesh/src/web/components/chat/input.tsx   |  31 ++--
 3 files changed, 28 insertions(+), 184 deletions(-)

diff --git a/apps/mesh/src/api/routes/transcribe.ts b/apps/mesh/src/api/routes/transcribe.ts
index 58b78f2a39..8d22e3182f 100644
--- a/apps/mesh/src/api/routes/transcribe.ts
+++ b/apps/mesh/src/api/routes/transcribe.ts
@@ -4,14 +4,13 @@
  * Provides audio transcription functionality by:
  * 1. Receiving audio via FormData (blob) or URL
  * 2. Finding a connection with TRANSCRIPTION_BINDING
- * 3. Using OBJECT_STORAGE_BINDING for temporary upload if needed
+ * 3. Converting audio to data URL (base64) for direct transcription
  * 4. Calling TRANSCRIBE_AUDIO and returning the result
  */
 
 import {
   TranscriptionBinding,
   TRANSCRIPTION_BINDING,
-  OBJECT_STORAGE_BINDING,
   SUPPORTED_AUDIO_FORMATS,
   connectionImplementsBinding,
   type Binder,
@@ -135,118 +134,12 @@ async function findConnectionWithBinding(
 }
 
 /**
- * Upload audio to object storage and get a presigned URL
+ * Convert a Blob to a data URL (base64)
  */
-async function uploadAudioToObjectStorage(
-  ctx: MeshContext,
-  connection: ConnectionEntity,
-  audioBlob: Blob,
-  mimeType: string,
-): Promise<{ url: string; key: string }> {
-  const proxy = await ctx.createMCPProxy(connection);
-
-  // Generate unique key for temporary audio file
-  const timestamp = Date.now();
-  const randomSuffix = Math.random().toString(36).substring(2, 8);
-  const extension = mimeType.split("/")[1]?.split(";")[0] || "webm";
-  const key = `_transcription_temp/${timestamp}-${randomSuffix}.${extension}`;
-
-  // Get presigned URL for upload
-  const putResult = await proxy.client.callTool({
-    name: "PUT_PRESIGNED_URL",
-    arguments: {
-      key,
-      contentType: mimeType,
-      expiresIn: 300, // 5 minutes
-    },
-  });
-
-  if (putResult.isError) {
-    const errorText =
-      putResult.content
-        .map((c: { type: string; text?: string }) =>
-          c.type === "text" ? c.text : "",
-        )
-        .join("\n") || "Failed to get upload URL";
-    throw new Error(errorText);
-  }
-
-  // Extract URL from result
-  const putContent = putResult.content.find(
-    (c: { type: string }) => c.type === "text",
-  );
-  if (!putContent || putContent.type !== "text") {
-    throw new Error("Invalid PUT_PRESIGNED_URL response");
-  }
-
-  const putData = JSON.parse((putContent as { text: string }).text) as {
-    url: string;
-  };
-
-  // Upload the audio blob
-  const uploadResponse = await fetch(putData.url, {
-    method: "PUT",
-    body: audioBlob,
-    headers: {
-      "Content-Type": mimeType,
-    },
-  });
-
-  if (!uploadResponse.ok) {
-    throw new Error(`Failed to upload audio: ${uploadResponse.statusText}`);
-  }
-
-  // Get presigned URL for reading
-  const getResult = await proxy.client.callTool({
-    name: "GET_PRESIGNED_URL",
-    arguments: {
-      key,
-      expiresIn: 300, // 5 minutes
-    },
-  });
-
-  if (getResult.isError) {
-    const errorText =
-      getResult.content
-        .map((c: { type: string; text?: string }) =>
-          c.type === "text" ? c.text : "",
-        )
-        .join("\n") || "Failed to get download URL";
-    throw new Error(errorText);
-  }
-
-  const getContent = getResult.content.find(
-    (c: { type: string }) => c.type === "text",
-  );
-  if (!getContent || getContent.type !== "text") {
-    throw new Error("Invalid GET_PRESIGNED_URL response");
-  }
-
-  const getData = JSON.parse((getContent as { text: string }).text) as {
-    url: string;
-  };
-
-  return { url: getData.url, key };
-}
-
-/**
- * Delete temporary audio file from object storage
- */
-async function deleteAudioFromObjectStorage(
-  ctx: MeshContext,
-  connection: ConnectionEntity,
-  key: string,
-): Promise<void> {
-  try {
-    const proxy = await ctx.createMCPProxy(connection);
-    await proxy.client.callTool({
-      name: "DELETE_OBJECT",
-      arguments: { key },
-    });
-  } catch (error) {
-    // Log but don't fail if cleanup fails
-    console.warn("[transcribe] Failed to cleanup temporary file:", key, error);
-  }
+async function blobToDataUrl(blob: Blob, mimeType: string): Promise<string> {
+  const arrayBuffer = await blob.arrayBuffer();
+  const base64 = Buffer.from(arrayBuffer).toString("base64");
+  return `data:${mimeType};base64,${base64}`;
 }
 
 /**
@@ -348,43 +241,17 @@ app.post("/:org/transcribe", async (c) => {
     );
   }
 
-  // 6. Handle audio upload if blob provided
+  // 6. Convert audio to data URL if blob provided
   let finalAudioUrl = audioUrl;
-  let tempFileKey: string | null = null;
-  let objectStorageConnection: ConnectionEntity | null = null;
 
   if (audioFile && !audioUrl) {
-    // Find object storage connection for temporary upload
-    objectStorageConnection = await findConnectionWithBinding(
-      ctx,
-      organizationId,
-      OBJECT_STORAGE_BINDING,
-    );
-
-    if (!objectStorageConnection) {
-      return c.json(
-        {
-          error:
-            "No object storage configured. Please add a connection with object storage capabilities (e.g., S3, R2, GCS) or provide an audioUrl instead.",
-        },
-        400,
-      );
-    }
-
     try {
-      const uploadResult = await uploadAudioToObjectStorage(
-        ctx,
-        objectStorageConnection,
-        audioFile,
-        audioFile.type,
-      );
-      finalAudioUrl = uploadResult.url;
-      tempFileKey = uploadResult.key;
+      finalAudioUrl = await blobToDataUrl(audioFile, audioFile.type);
     } catch (error) {
-      console.error("[transcribe] Upload failed:", error);
+      console.error("[transcribe] Failed to convert audio to data URL:", error);
       return c.json(
         {
-          error: `Failed to upload audio: ${error instanceof Error ? error.message : "Unknown error"}`,
+          error: `Failed to process audio: ${error instanceof Error ? error.message : "Unknown error"}`,
         },
         500,
       );
@@ -402,17 +269,7 @@ app.post("/:org/transcribe", async (c) => {
       language: language ?? undefined,
     });
 
-    // 8. Cleanup temporary file
-    if (tempFileKey && objectStorageConnection) {
-      // Don't await - cleanup in background
-      void deleteAudioFromObjectStorage(
-        ctx,
-        objectStorageConnection,
-        tempFileKey,
-      );
-    }
-
-    // 9. Return result
+    // 8. Return result
     return c.json({
       text: result.text,
       language: result.language,
@@ -422,15 +279,6 @@ app.post("/:org/transcribe", async (c) => {
   } catch (error) {
     console.error("[transcribe] Transcription failed:", error);
 
-    // Cleanup on error
-    if (tempFileKey && objectStorageConnection) {
-      void deleteAudioFromObjectStorage(
-        ctx,
-        objectStorageConnection,
-        tempFileKey,
-      );
-    }
-
     return c.json(
       {
         error: `Transcription failed: ${error instanceof Error ? error.message : "Unknown error"}`,
diff --git a/apps/mesh/src/web/components/chat/context.tsx b/apps/mesh/src/web/components/chat/context.tsx
index af67edba49..bb1d9b5b4a 100644
--- a/apps/mesh/src/web/components/chat/context.tsx
+++ b/apps/mesh/src/web/components/chat/context.tsx
@@ -128,7 +128,6 @@ interface ChatContextValue {
 
   // Binding availability
   hasTranscriptionBinding: boolean;
-  hasObjectStorageBinding: boolean;
 }
 
 // ============================================================================
@@ -570,12 +569,7 @@ export function ChatProvider({
     connections: allConnections,
     binding: "TRANSCRIPTION",
   });
-  const objectStorageConnections = useBindingConnections({
-    connections: allConnections,
-    binding: "OBJECT_STORAGE",
-  });
   const hasTranscriptionBinding = transcriptionConnections.length > 0;
-  const hasObjectStorageBinding = objectStorageConnections.length > 0;
 
   // Context prompt
   const contextPrompt = useContextHook(storedSelectedVirtualMcpId);
@@ -852,7 +846,6 @@ export function ChatProvider({
 
     // Binding availability
     hasTranscriptionBinding,
-    hasObjectStorageBinding,
   };
 
   return <ChatContext.Provider value={value}>{children}</ChatContext.Provider>;
diff --git a/apps/mesh/src/web/components/chat/input.tsx b/apps/mesh/src/web/components/chat/input.tsx
index e3961f3b38..a9266072de 100644
--- a/apps/mesh/src/web/components/chat/input.tsx
+++ b/apps/mesh/src/web/components/chat/input.tsx
@@ -206,7 +206,6 @@ export function ChatInput() {
     finishReason,
     clearFinishReason,
     hasTranscriptionBinding,
-    hasObjectStorageBinding,
   } = useChat();
 
   const { org } = useProjectContext();
@@ -453,22 +452,28 @@ export function ChatInput() {
                     selectedModel={selectedModel}
                     isStreaming={isStreaming}
                   />
-                  {/* Audio Recording Button - only show if transcription and object storage bindings are available */}
-                  {hasTranscriptionBinding && hasObjectStorageBinding && (
-                    <Tooltip>
-                      <TooltipTrigger asChild>
+                  {/* Audio Recording Button - always visible, disabled if no transcription binding */}
+                  <Tooltip>
+                    <TooltipTrigger asChild>
+                      {/* Wrap in span to enable tooltip on disabled button */}
+                      <span className="inline-flex">
                         <Button
                           type="button"
                           variant="ghost"
                           size="icon"
                           disabled={
-                            !selectedModel || isStreaming || isTranscribing
+                            !hasTranscriptionBinding ||
+                            !selectedModel ||
+                            isStreaming ||
+                            isTranscribing
                           }
                           onClick={handleRecordingToggle}
                           className={cn(
                             "size-8 rounded-full transition-all relative",
                             isRecording &&
                               "text-destructive hover:text-destructive",
+                            !hasTranscriptionBinding &&
+                              "opacity-40 cursor-not-allowed",
                           )}
                         >
                           {isTranscribing ? (
@@ -500,16 +505,14 @@ export function ChatInput() {
                             <Microphone01 size={20} />
                           )}
                         </Button>
-                      </TooltipTrigger>
+                      </span>
+                    </TooltipTrigger>
+                    {!hasTranscriptionBinding && (
                       <TooltipContent side="top" sideOffset={8}>
-                        {isTranscribing
-                          ? "Transcribing..."
-                          : isRecording
-                            ? "Click to stop and transcribe"
-                            : "Record audio"}
+                        Add a transcription MCP to enable voice input
                       </TooltipContent>
-                    </Tooltip>
-                  )}
+                    )}
+                  </Tooltip>
                   <Button
                     type={isStreaming ? "button" : "submit"}
                     onClick={(e: React.MouseEvent<HTMLButtonElement>) => {

From c5549d6aad821f8907e72bc33407827e6bd247e8 Mon Sep 17 00:00:00 2001
From: viniciusventura29 <viniciusventura29@gmail.com>
Date: Fri, 23 Jan 2026 19:53:38 -0300
Subject: [PATCH 09/11] Enhance chat input to append transcribed text to
 existing content or create a new document if empty. Improve handling of the
 last paragraph's content for seamless integration of transcriptions.

---
 apps/mesh/src/web/components/chat/input.tsx | 66 ++++++++++++++++++++----
 1 file changed, 56 insertions(+), 10 deletions(-)

diff --git a/apps/mesh/src/web/components/chat/input.tsx b/apps/mesh/src/web/components/chat/input.tsx
index a9266072de..d517e8a8cb 100644
--- a/apps/mesh/src/web/components/chat/input.tsx
+++ b/apps/mesh/src/web/components/chat/input.tsx
@@ -297,17 +297,63 @@ export function ChatInput() {
 
         const data = (await response.json()) as { text?: string };
         if (data.text) {
-          // Insert transcribed text into the input
-          const doc = {
-            type: "doc" as const,
-            content: [
-              {
+          // Append transcribed text to the existing input content
+          const transcribedText = data.text;
+
+          // If there's existing content, append to it; otherwise create new doc
+          if (tiptapDoc && tiptapDoc.content && tiptapDoc.content.length > 0) {
+            // Shallow-copy the node list so the current doc is not mutated
+            const newContent = [...tiptapDoc.content];
+            const lastParagraphIndex = newContent.length - 1;
+            const lastParagraph = newContent[lastParagraphIndex];
+
+            // Three cases for the last top-level node: a non-empty paragraph,
+            // an empty paragraph, or some other node type
+            if (
+              lastParagraph &&
+              lastParagraph.type === "paragraph" &&
+              lastParagraph.content &&
+              lastParagraph.content.length > 0
+            ) {
+              // Append to the last paragraph with a space separator
+              newContent[lastParagraphIndex] = {
+                ...lastParagraph,
+                content: [
+                  ...lastParagraph.content,
+                  { type: "text", text: ` ${transcribedText}` },
+                ],
+              };
+            } else if (lastParagraph?.type === "paragraph") {
+              // Last paragraph is empty, replace it with the transcribed text
+              newContent[lastParagraphIndex] = {
                 type: "paragraph",
-                content: [{ type: "text", text: data.text }],
-              },
-            ],
-          };
-          setTiptapDoc(doc);
+                content: [{ type: "text", text: transcribedText }],
+              };
+            } else {
+              // Last node is not a paragraph (list, code block, ...): replacing
+              // it would discard user content, so add a new paragraph instead
+              newContent.push({
+                type: "paragraph",
+                content: [{ type: "text", text: transcribedText }],
+              });
+            }
+
+            setTiptapDoc({
+              type: "doc" as const,
+              content: newContent,
+            });
+          } else {
+            // No existing content, create new doc
+            setTiptapDoc({
+              type: "doc" as const,
+              content: [
+                {
+                  type: "paragraph",
+                  content: [{ type: "text", text: transcribedText }],
+                },
+              ],
+            });
+          }
         }
       } catch (err) {
         toast.error(

From 0f60ab535f910eaf2dbbbb0ef0845298d0c74735 Mon Sep 17 00:00:00 2001
From: viniciusventura29 <viniciusventura29@gmail.com>
Date: Fri, 23 Jan 2026 19:55:29 -0300
Subject: [PATCH 10/11] Refactor audio processing in transcription to convert
 Blob to base64 string instead of data URL. Update related comments and error
 handling for improved clarity.

---
 apps/mesh/src/api/routes/transcribe.ts | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/apps/mesh/src/api/routes/transcribe.ts b/apps/mesh/src/api/routes/transcribe.ts
index 8d22e3182f..f1dc004bf8 100644
--- a/apps/mesh/src/api/routes/transcribe.ts
+++ b/apps/mesh/src/api/routes/transcribe.ts
@@ -4,7 +4,7 @@
  * Provides audio transcription functionality by:
  * 1. Receiving audio via FormData (blob) or URL
  * 2. Finding a connection with TRANSCRIPTION_BINDING
- * 3. Converting audio to data URL (base64) for direct transcription
+ * 3. Converting audio blob to base64 (passed via 'audio' field) or using URL directly
  * 4. Calling TRANSCRIBE_AUDIO and returning the result
  */
 
@@ -134,12 +134,11 @@ async function findConnectionWithBinding(
 }
 
 /**
- * Convert a Blob to a data URL (base64)
+ * Convert a Blob to a base64-encoded string
  */
-async function blobToDataUrl(blob: Blob, mimeType: string): Promise<string> {
+async function blobToBase64(blob: Blob): Promise<string> {
   const arrayBuffer = await blob.arrayBuffer();
-  const base64 = Buffer.from(arrayBuffer).toString("base64");
-  return `data:${mimeType};base64,${base64}`;
+  return Buffer.from(arrayBuffer).toString("base64");
 }
 
 /**
@@ -241,14 +240,14 @@ app.post("/:org/transcribe", async (c) => {
     );
   }
 
-  // 6. Convert audio to data URL if blob provided
-  let finalAudioUrl = audioUrl;
+  // 6. Convert audio to base64 if blob provided
+  let audioBase64: string | undefined;
 
   if (audioFile && !audioUrl) {
     try {
-      finalAudioUrl = await blobToDataUrl(audioFile, audioFile.type);
+      audioBase64 = await blobToBase64(audioFile);
     } catch (error) {
-      console.error("[transcribe] Failed to convert audio to data URL:", error);
+      console.error("[transcribe] Failed to convert audio to base64:", error);
       return c.json(
         {
           error: `Failed to process audio: ${error instanceof Error ? error.message : "Unknown error"}`,
@@ -264,7 +263,8 @@ app.post("/:org/transcribe", async (c) => {
     const transcriptionClient = TranscriptionBinding.forClient(proxy);
 
     const result = await transcriptionClient.TRANSCRIBE_AUDIO({
-      audioUrl: finalAudioUrl ?? undefined,
+      audio: audioBase64,
+      audioUrl: audioUrl ?? undefined,
       mimeType: audioFile?.type,
       language: language ?? undefined,
     });

From 046cb4bbb05d86515119e71bd07e8a4b589c9dda Mon Sep 17 00:00:00 2001
From: viniciusventura29 <viniciusventura29@gmail.com>
Date: Fri, 23 Jan 2026 20:27:09 -0300
Subject: [PATCH 11/11] Remove OBJECT_STORAGE_BINDING from BUILTIN_BINDINGS in
 use-binding hook to eliminate unnecessary dependency.

---
 apps/mesh/src/web/hooks/use-binding.ts | 2 --
 1 file changed, 2 deletions(-)

diff --git a/apps/mesh/src/web/hooks/use-binding.ts b/apps/mesh/src/web/hooks/use-binding.ts
index 60a8bcd1e3..2a6be0618d 100644
--- a/apps/mesh/src/web/hooks/use-binding.ts
+++ b/apps/mesh/src/web/hooks/use-binding.ts
@@ -2,7 +2,6 @@ import { z } from "zod";
 import {
   type Binder,
   createBindingChecker,
-  OBJECT_STORAGE_BINDING,
   TRANSCRIPTION_BINDING,
 } from "@decocms/bindings";
 import {
@@ -29,7 +28,6 @@ const BUILTIN_BINDINGS: Record<string, Binder> = {
   ASSISTANTS: ASSISTANTS_BINDING,
   MCP: MCP_BINDING,
   TRANSCRIPTION: TRANSCRIPTION_BINDING,
-  OBJECT_STORAGE: OBJECT_STORAGE_BINDING,
 };
 
 /**