diff --git a/.env.example b/.env.example
index 1d854bc0e..cffee0eee 100644
--- a/.env.example
+++ b/.env.example
@@ -88,6 +88,9 @@ ASR_OPENAI_BASE_URL=
ASR_QWEN_API_KEY=
ASR_QWEN_BASE_URL=
+ASR_AZURE_API_KEY=
+ASR_AZURE_BASE_URL=https://{region}.api.cognitive.microsoft.com
+
# --- PDF Processing -----------------------------------------------------------
PDF_UNPDF_API_KEY=
diff --git a/components/settings/asr-settings.tsx b/components/settings/asr-settings.tsx
index 4a0cfb021..340a515de 100644
--- a/components/settings/asr-settings.tsx
+++ b/components/settings/asr-settings.tsx
@@ -33,6 +33,7 @@ export function ASRSettings({ selectedProviderId }: ASRSettingsProps) {
const setASRProviderConfig = useSettingsStore((state) => state.setASRProviderConfig);
const asrProvider = ASR_PROVIDERS[selectedProviderId] ?? ASR_PROVIDERS['openai-whisper'];
+ const availableModels = asrProvider.models ?? [];
const isServerConfigured = !!asrProvidersConfig[selectedProviderId]?.isServerConfigured;
const [showApiKey, setShowApiKey] = useState(false);
@@ -294,7 +295,7 @@ export function ASRSettings({ selectedProviderId }: ASRSettingsProps) {
)}
{/* Model Selection */}
- {asrProvider.models.length > 0 && (
+ {availableModels.length > 0 && (
- {asrProvider.models.map((model) => (
+ {availableModels.map((model) => (
{model.name}
diff --git a/components/settings/audio-settings.tsx b/components/settings/audio-settings.tsx
index d88590ac0..3983acb37 100644
--- a/components/settings/audio-settings.tsx
+++ b/components/settings/audio-settings.tsx
@@ -50,6 +50,7 @@ function getASRProviderName(providerId: ASRProviderId, t: (key: string) => strin
'openai-whisper': t('settings.providerOpenAIWhisper'),
'browser-native': t('settings.providerBrowserNative'),
'qwen-asr': t('settings.providerQwenASR'),
+ 'azure-asr': t('settings.providerAzureASR'),
};
return names[providerId];
}
diff --git a/components/settings/index.tsx b/components/settings/index.tsx
index da9c11f56..1d5d721d2 100644
--- a/components/settings/index.tsx
+++ b/components/settings/index.tsx
@@ -134,6 +134,7 @@ function getASRProviderName(providerId: ASRProviderId, t: (key: string) => strin
'openai-whisper': t('settings.providerOpenAIWhisper'),
'browser-native': t('settings.providerBrowserNative'),
'qwen-asr': t('settings.providerQwenASR'),
+ 'azure-asr': t('settings.providerAzureASR'),
};
return names[providerId];
}
diff --git a/lib/audio/asr-providers.ts b/lib/audio/asr-providers.ts
index 93365899a..d4302c9e0 100644
--- a/lib/audio/asr-providers.ts
+++ b/lib/audio/asr-providers.ts
@@ -184,6 +184,9 @@ export async function transcribeAudio(
case 'qwen-asr':
return await transcribeQwenASR(config, audioBuffer);
+ case 'azure-asr':
+ return await transcribeAzureASR(config, audioBuffer);
+
default:
throw new Error(`Unsupported ASR provider: ${config.providerId}`);
}
@@ -326,6 +329,90 @@ async function transcribeQwenASR(
return { text: transcribedText };
}
+/**
+ * Azure STT implementation (Fast Transcription REST API)
+ * https://learn.microsoft.com/azure/ai-services/speech-service/fast-transcription-create
+ */
+async function transcribeAzureASR(
+ config: ASRModelConfig,
+ audioBuffer: Buffer | Blob,
+): Promise<{ text: string }> {
+ const rawBaseUrl = config.baseUrl || ASR_PROVIDERS['azure-asr'].defaultBaseUrl!;
+
+ if (!rawBaseUrl || rawBaseUrl.includes('{region}')) {
+ throw new Error('Azure STT base URL must include a real region');
+ }
+
+ let endpoint = rawBaseUrl.replace(/\/+$/, '');
+ if (/\.stt\.speech\.microsoft\.com$/i.test(endpoint)) {
+ endpoint = endpoint.replace(/\.stt\.speech\.microsoft\.com$/i, '.api.cognitive.microsoft.com');
+ }
+ if (!/\/speechtotext\/transcriptions:transcribe/i.test(endpoint)) {
+ endpoint = `${endpoint}/speechtotext/transcriptions:transcribe`;
+ }
+ const url = new URL(endpoint);
+ if (!url.searchParams.get('api-version')) {
+ url.searchParams.set('api-version', '2025-10-15');
+ }
+
+ let audioBlob: Blob;
+ if (audioBuffer instanceof Blob) {
+ audioBlob = audioBuffer;
+ } else {
+ audioBlob = new Blob([audioBuffer as unknown as BlobPart], { type: 'audio/webm' });
+ }
+
+ const formData = new FormData();
+ formData.append('audio', audioBlob, 'recording.webm');
+
+  const localeMap: Record<string, string> = {
+ en: 'en-US',
+ zh: 'zh-CN',
+ ja: 'ja-JP',
+ ko: 'ko-KR',
+ de: 'de-DE',
+ fr: 'fr-FR',
+ es: 'es-ES',
+ it: 'it-IT',
+ pt: 'pt-BR',
+ ru: 'ru-RU',
+ ar: 'ar-SA',
+ hi: 'hi-IN',
+ };
+
+ if (config.language && config.language !== 'auto') {
+ const locale = localeMap[config.language] || config.language;
+ formData.append('definition', JSON.stringify({ locales: [locale] }));
+ }
+
+ const response = await fetch(url.toString(), {
+ method: 'POST',
+ headers: { 'Ocp-Apim-Subscription-Key': config.apiKey! },
+ body: formData,
+ });
+
+ if (!response.ok) {
+ const errorText = await response.text().catch(() => response.statusText);
+ throw new Error(`Azure STT error (${response.status}): ${errorText}`);
+ }
+
+ const data = (await response.json()) as {
+ combinedPhrases?: Array<{ text?: string }>;
+ phrases?: Array<{ text?: string }>;
+ };
+
+ const combinedText = data.combinedPhrases
+ ?.map((p) => p.text || '')
+ .filter(Boolean)
+ .join(' ');
+ const phraseText = data.phrases
+ ?.map((p) => p.text || '')
+ .filter(Boolean)
+ .join(' ');
+
+ return { text: combinedText || phraseText || '' };
+}
+
/**
* Get current ASR configuration from settings store
* Note: This function should only be called in browser context
diff --git a/lib/audio/constants.ts b/lib/audio/constants.ts
index 423f5b82c..2f580ab29 100644
--- a/lib/audio/constants.ts
+++ b/lib/audio/constants.ts
@@ -1031,6 +1031,32 @@ export const ASR_PROVIDERS: Record = {
supportedFormats: ['mp3', 'wav', 'webm', 'm4a', 'flac'],
},
+ 'azure-asr': {
+ id: 'azure-asr',
+ name: 'Azure STT',
+ requiresApiKey: true,
+ defaultBaseUrl: 'https://{region}.api.cognitive.microsoft.com',
+ icon: '/logos/azure.svg',
+ models: [],
+ defaultModelId: '',
+ supportedLanguages: [
+ 'auto',
+ 'en',
+ 'zh',
+ 'ja',
+ 'ko',
+ 'de',
+ 'fr',
+ 'es',
+ 'it',
+ 'pt',
+ 'ru',
+ 'ar',
+ 'hi',
+ ],
+ supportedFormats: ['wav', 'ogg', 'webm', 'mp3', 'flac', 'm4a'],
+ },
+
'browser-native': {
id: 'browser-native',
name: '浏览器原生 ASR (Web Speech API)',
diff --git a/lib/audio/types.ts b/lib/audio/types.ts
index 0c3c91792..d808a021c 100644
--- a/lib/audio/types.ts
+++ b/lib/audio/types.ts
@@ -16,6 +16,7 @@
* - OpenAI Whisper (https://platform.openai.com/docs/guides/speech-to-text)
* - Browser Native (Web Speech API, client-side only)
* - Qwen ASR (DashScope API)
+ * - Azure STT (https://learn.microsoft.com/azure/ai-services/speech-service/fast-transcription-create)
*
* Future Provider Support (extensible):
* - ElevenLabs TTS/ASR (https://elevenlabs.io/docs)
@@ -152,7 +153,7 @@ export interface TTSModelConfig {
* Add new ASR providers here as union members.
* Keep in sync with ASR_PROVIDERS registry in constants.ts
*/
-export type ASRProviderId = 'openai-whisper' | 'browser-native' | 'qwen-asr';
+export type ASRProviderId = 'openai-whisper' | 'browser-native' | 'qwen-asr' | 'azure-asr';
// Add new ASR providers below (uncomment and modify):
// | 'elevenlabs-asr'
// | 'assemblyai-asr'
diff --git a/lib/i18n/locales/en-US.json b/lib/i18n/locales/en-US.json
index 997857983..1f9a1b90c 100644
--- a/lib/i18n/locales/en-US.json
+++ b/lib/i18n/locales/en-US.json
@@ -539,6 +539,7 @@
"providerOpenAIWhisper": "OpenAI ASR (gpt-4o-mini-transcribe)",
"providerBrowserNative": "Browser Native ASR",
"providerQwenASR": "Qwen ASR (Alibaba Cloud Bailian)",
+ "providerAzureASR": "Azure STT",
"providerUnpdf": "unpdf (Built-in)",
"providerMinerU": "MinerU",
"browserNativeTTSNote": "Browser Native TTS requires no configuration and is completely free, using system built-in voices",
diff --git a/lib/i18n/locales/ja-JP.json b/lib/i18n/locales/ja-JP.json
index 6d82ebf07..c789cb617 100644
--- a/lib/i18n/locales/ja-JP.json
+++ b/lib/i18n/locales/ja-JP.json
@@ -539,6 +539,7 @@
"providerOpenAIWhisper": "OpenAI ASR (gpt-4o-mini-transcribe)",
"providerBrowserNative": "ブラウザネイティブASR",
"providerQwenASR": "Qwen ASR(Alibaba Cloud百錬)",
+ "providerAzureASR": "Azure STT",
"providerUnpdf": "unpdf(組み込み)",
"providerMinerU": "MinerU",
"browserNativeTTSNote": "ブラウザネイティブTTSは設定不要で完全無料です。システム内蔵のボイスを使用します",
diff --git a/lib/i18n/locales/ru-RU.json b/lib/i18n/locales/ru-RU.json
index 9afa76e68..32a1230f6 100644
--- a/lib/i18n/locales/ru-RU.json
+++ b/lib/i18n/locales/ru-RU.json
@@ -539,6 +539,7 @@
"providerOpenAIWhisper": "OpenAI ASR (gpt-4o-mini-transcribe)",
"providerBrowserNative": "Встроенный ASR браузера",
"providerQwenASR": "Qwen ASR (Alibaba Cloud Bailian)",
+ "providerAzureASR": "Azure STT",
"providerUnpdf": "unpdf (встроенный)",
"providerMinerU": "MinerU",
"browserNativeTTSNote": "Встроенный TTS браузера не требует настройки и полностью бесплатен, использует системные голоса",
diff --git a/lib/i18n/locales/zh-CN.json b/lib/i18n/locales/zh-CN.json
index b2f7a2a79..d3f639a37 100644
--- a/lib/i18n/locales/zh-CN.json
+++ b/lib/i18n/locales/zh-CN.json
@@ -539,6 +539,7 @@
"providerOpenAIWhisper": "OpenAI ASR (gpt-4o-mini-transcribe)",
"providerBrowserNative": "浏览器原生 ASR",
"providerQwenASR": "Qwen ASR(阿里云百炼)",
+ "providerAzureASR": "Azure STT",
"providerUnpdf": "unpdf(内置)",
"providerMinerU": "MinerU",
"browserNativeTTSNote": "浏览器原生 TTS 无需配置,完全免费,使用系统内置语音",
diff --git a/lib/server/provider-config.ts b/lib/server/provider-config.ts
index 259208fde..fb4fce446 100644
--- a/lib/server/provider-config.ts
+++ b/lib/server/provider-config.ts
@@ -65,6 +65,7 @@ const TTS_ENV_MAP: Record = {
const ASR_ENV_MAP: Record<string, ASRProviderId> = {
ASR_OPENAI: 'openai-whisper',
ASR_QWEN: 'qwen-asr',
+ ASR_AZURE: 'azure-asr',
};
const PDF_ENV_MAP: Record<string, PDFProviderId> = {
diff --git a/lib/store/settings.ts b/lib/store/settings.ts
index 4b088bbc6..a68205e14 100644
--- a/lib/store/settings.ts
+++ b/lib/store/settings.ts
@@ -299,6 +299,7 @@ const getDefaultAudioConfig = () => ({
'openai-whisper': { apiKey: '', baseUrl: '', enabled: true },
'browser-native': { apiKey: '', baseUrl: '', enabled: true },
'qwen-asr': { apiKey: '', baseUrl: '', enabled: false },
+ 'azure-asr': { apiKey: '', baseUrl: '', enabled: false },
} as Record,
});