From ceaf1416258c71f01b89797b91c6f7275377b3b3 Mon Sep 17 00:00:00 2001 From: Luis C Date: Thu, 7 Aug 2025 12:29:23 -0400 Subject: [PATCH 1/4] Add ASR support for Replicate provider --- packages/inference/src/providers/replicate.ts | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/packages/inference/src/providers/replicate.ts b/packages/inference/src/providers/replicate.ts index 4c7b8fb169..15d9c1bdb1 100644 --- a/packages/inference/src/providers/replicate.ts +++ b/packages/inference/src/providers/replicate.ts @@ -20,11 +20,14 @@ import type { BodyParams, HeaderParams, RequestArgs, UrlParams } from "../types. import { omit } from "../utils/omit.js"; import { TaskProviderHelper, + type AutomaticSpeechRecognitionTaskHelper, type ImageToImageTaskHelper, type TextToImageTaskHelper, type TextToVideoTaskHelper, } from "./providerHelper.js"; import type { ImageToImageArgs } from "../tasks/cv/imageToImage.js"; +import type { AutomaticSpeechRecognitionArgs } from "../tasks/audio/automaticSpeechRecognition.js"; +import type { AutomaticSpeechRecognitionOutput } from "@huggingface/tasks"; import { base64FromBytes } from "../utils/base64FromBytes.js"; export interface ReplicateOutput { output?: string | string[]; @@ -163,6 +166,61 @@ export class ReplicateTextToVideoTask extends ReplicateTask implements TextToVid } } +export class ReplicateAutomaticSpeechRecognitionTask extends ReplicateTask implements AutomaticSpeechRecognitionTaskHelper { + override preparePayload(params: BodyParams): Record { + return { + input: { + ...omit(params.args, ["inputs", "parameters"]), + ...(params.args.parameters as Record), + audio: params.args.inputs, // This will be processed in preparePayloadAsync + }, + version: params.model.includes(":") ? params.model.split(":")[1] : undefined, + }; + } + + async preparePayloadAsync(args: AutomaticSpeechRecognitionArgs): Promise { + const blob = "data" in args && args.data instanceof Blob ? args.data : "inputs" in args ? args.inputs : undefined; + + if (!blob || !(blob instanceof Blob)) { + throw new Error("Audio input must be a Blob"); + } + + // Convert Blob to base64 data URL + const bytes = new Uint8Array(await blob.arrayBuffer()); + const base64 = base64FromBytes(bytes); + const audioInput = `data:${blob.type || "audio/wav"};base64,${base64}`; + + return { + ...("data" in args ? omit(args, "data") : omit(args, "inputs")), + inputs: audioInput, + }; + } + + override async getResponse(response: ReplicateOutput): Promise { + if ( + typeof response === "object" && + !!response && + "output" in response && + typeof response.output === "string" + ) { + return { text: response.output }; + } + + if ( + typeof response === "object" && + !!response && + "output" in response && + Array.isArray(response.output) && + response.output.length > 0 && + typeof response.output[0] === "string" + ) { + return { text: response.output[0] }; + } + + throw new InferenceClientProviderOutputError("Received malformed response from Replicate automatic-speech-recognition API"); + } +} + export class ReplicateImageToImageTask extends ReplicateTask implements ImageToImageTaskHelper { override preparePayload(params: BodyParams): Record { return { From af7a21e2a8593a5b6fd4fa94c3f4ba07fbbdf6e1 Mon Sep 17 00:00:00 2001 From: Luis C Date: Thu, 7 Aug 2025 15:09:32 -0400 Subject: [PATCH 2/4] Fix lint --- packages/inference/src/providers/replicate.ts | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/packages/inference/src/providers/replicate.ts b/packages/inference/src/providers/replicate.ts index 15d9c1bdb1..240199cd3b 100644 --- a/packages/inference/src/providers/replicate.ts +++ b/packages/inference/src/providers/replicate.ts @@ -166,7 +166,10 @@ export class ReplicateTextToVideoTask extends ReplicateTask implements TextToVid } } -export class ReplicateAutomaticSpeechRecognitionTask extends ReplicateTask implements AutomaticSpeechRecognitionTaskHelper { +export class ReplicateAutomaticSpeechRecognitionTask + extends ReplicateTask + implements AutomaticSpeechRecognitionTaskHelper +{ override preparePayload(params: BodyParams): Record { return { input: { @@ -180,7 +183,7 @@ export class ReplicateAutomaticSpeechRecognitionTask extends ReplicateTask imple async preparePayloadAsync(args: AutomaticSpeechRecognitionArgs): Promise { const blob = "data" in args && args.data instanceof Blob ? args.data : "inputs" in args ? args.inputs : undefined; - + if (!blob || !(blob instanceof Blob)) { throw new Error("Audio input must be a Blob"); } @@ -197,12 +200,7 @@ export class ReplicateAutomaticSpeechRecognitionTask extends ReplicateTask imple } override async getResponse(response: ReplicateOutput): Promise { - if ( - typeof response === "object" && - !!response && - "output" in response && - typeof response.output === "string" - ) { + if (typeof response === "object" && !!response && "output" in response && typeof response.output === "string") { return { text: response.output }; } @@ -217,7 +215,9 @@ export class ReplicateAutomaticSpeechRecognitionTask extends ReplicateTask imple return { text: response.output[0] }; } - throw new InferenceClientProviderOutputError("Received malformed response from Replicate automatic-speech-recognition API"); + throw new InferenceClientProviderOutputError( + "Received malformed response from Replicate automatic-speech-recognition API" + ); } } From 5299c1ca112432e32283d8ff8a2a0336785193b8 Mon Sep 17 00:00:00 2001 From: Luis C Date: Mon, 11 Aug 2025 20:35:22 -0400 Subject: [PATCH 3/4] Add ReplicateASR mapping --- packages/inference/src/lib/getProviderHelper.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/inference/src/lib/getProviderHelper.ts b/packages/inference/src/lib/getProviderHelper.ts index 72cc35bf62..4409927e47 100644 --- a/packages/inference/src/lib/getProviderHelper.ts +++ b/packages/inference/src/lib/getProviderHelper.ts @@ -143,6 +143,7 @@ export const PROVIDERS: Record Date: Tue, 12 Aug 2025 17:03:44 +0200 Subject: [PATCH 4/4] fix get response --- packages/inference/src/providers/replicate.ts | 29 ++++++++++--------- .../tasks/audio/automaticSpeechRecognition.ts | 5 ---- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/packages/inference/src/providers/replicate.ts b/packages/inference/src/providers/replicate.ts index 240199cd3b..75496fecee 100644 --- a/packages/inference/src/providers/replicate.ts +++ b/packages/inference/src/providers/replicate.ts @@ -200,21 +200,24 @@ export class ReplicateAutomaticSpeechRecognitionTask } override async getResponse(response: ReplicateOutput): Promise { - if (typeof response === "object" && !!response && "output" in response && typeof response.output === "string") { - return { text: response.output }; - } + if (typeof response?.output === "string") return { text: response.output }; + if (Array.isArray(response?.output) && typeof response.output[0] === "string") return { text: response.output[0] }; - if ( - typeof response === "object" && - !!response && - "output" in response && - Array.isArray(response.output) && - response.output.length > 0 && - typeof response.output[0] === "string" - ) { - return { text: response.output[0] }; + const out = response?.output as + | undefined + | { + transcription?: string; + translation?: string; + txt_file?: string; + }; + if (out && typeof out === "object") { + if (typeof out.transcription === "string") return { text: out.transcription }; + if (typeof out.translation === "string") return { text: out.translation }; + if (typeof out.txt_file === "string") { + const r = await fetch(out.txt_file); + return { text: await r.text() }; + } } - throw new InferenceClientProviderOutputError( "Received malformed response from Replicate automatic-speech-recognition API" ); diff --git a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts index a8ce6ebed6..c5a716d5bc 100644 --- a/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts +++ b/packages/inference/src/tasks/audio/automaticSpeechRecognition.ts @@ -4,7 +4,6 @@ import { getProviderHelper } from "../../lib/getProviderHelper.js"; import type { BaseArgs, Options } from "../../types.js"; import { innerRequest } from "../../utils/request.js"; import type { LegacyAudioInput } from "./utils.js"; -import { InferenceClientProviderOutputError } from "../../errors.js"; export type AutomaticSpeechRecognitionArgs = BaseArgs & (AutomaticSpeechRecognitionInput | LegacyAudioInput); /** @@ -22,9 +21,5 @@ export async function automaticSpeechRecognition( ...options, task: "automatic-speech-recognition", }); - const isValidOutput = typeof res?.text === "string"; - if (!isValidOutput) { - throw new InferenceClientProviderOutputError("Received malformed response from automatic-speech-recognition API"); - } return providerHelper.getResponse(res); }