diff --git a/README.md b/README.md
index 644cf48..a4aa9b3 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
----
+______________________________________________________________________
[![npm](https://img.shields.io/npm/v/assemblyai)](https://www.npmjs.com/package/assemblyai)
[![Test](https://github.com/AssemblyAI/assemblyai-node-sdk/actions/workflows/test.yml/badge.svg)](https://github.com/AssemblyAI/assemblyai-node-sdk/actions/workflows/test.yml)
@@ -101,7 +101,7 @@ let transcript = await client.transcripts.transcribe({
});
```
-> [!TIP]
+> [!NOTE]
> You can also pass a local file path, a stream, or a buffer as the `audio` property.
`transcribe` queues a transcription job and polls it until the `status` is `completed` or `error`.
@@ -128,7 +128,7 @@ let transcript = await client.transcripts.transcribe({
});
```
-> [!TIP]
+> **Note:**
> You can also pass a file URL, a stream, or a buffer as the `audio` property.
`transcribe` queues a transcription job and polls it until the `status` is `completed` or `error`.
@@ -224,7 +224,7 @@ do {
} while (previousPageUrl !== null);
```
-> [!TIP]
+> [!NOTE]
> To paginate over all pages, you need to use the `page.page_details.prev_url`
> because the transcripts are returned in descending order by creation date and time.
> The first page is are the most recent transcript, and each "previous" page are older transcripts.
@@ -263,9 +263,7 @@ const rt = client.streaming.transcriber({
> _Server code_:
>
> ```typescript
-> const token = await client.streaming.createTemporaryToken({
->   expires_in_seconds = 60,
-> });
+> const token = await client.streaming.createTemporaryToken({ expires_in_seconds = 60 });
> // TODO: return token to client
> ```
>
@@ -283,6 +281,7 @@ const rt = client.streaming.transcriber({
You can configure the following events.
+
```typescript
rt.on("open", ({ id, expires_at }) => console.log('Session ID:', id, 'Expires at:', expires_at));
rt.on("close", (code: number, reason: string) => console.log('Closed', code, reason));
diff --git a/package.json b/package.json
index 741a76c..03037dd 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
  "name": "assemblyai",
-  "version": "4.13.2",
+  "version": "4.13.3",
  "description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
  "engines": {
    "node": ">=18"
diff --git a/samples/speaker-diarization.ts b/samples/speaker-diarization.ts
new file mode 100644
index 0000000..dfb218e
--- /dev/null
+++ b/samples/speaker-diarization.ts
@@ -0,0 +1,80 @@
+/**
+ * Example of using speaker diarization with speaker_options
+ *
+ * Note: speaker_options and speakers_expected are mutually exclusive.
+ * Use either speakers_expected for simple guidance OR speaker_options for advanced control.
+ */
+
+import { AssemblyAI, SpeakerOptions } from "assemblyai";
+
+// Replace with your API key
+const client = new AssemblyAI({
+  apiKey: "YOUR_API_KEY",
+});
+
+async function transcribeWithSpeakerDiarization() {
+  // Example 1: Basic speaker diarization (uses smart defaults)
+  // The model automatically detects the optimal number of speakers
+  let transcript = await client.transcripts.transcribe({
+    audio: "https://example.com/audio.mp3",
+    speaker_labels: true,
+  });
+
+  console.log("Basic speaker diarization:", transcript.id);
+
+  // Example 2: Provide a hint with speakers_expected (smart default with guidance)
+  // Still uses smart defaults but gives the model a hint about expected speakers
+  transcript = await client.transcripts.transcribe({
+    audio: "https://example.com/audio.mp3",
+    speaker_labels: true,
+    speakers_expected: 3,
+  });
+
+  console.log("With expected speakers:", transcript.id);
+
+  // Example 3: Set boundaries with speaker_options (controlled smart defaults)
+  // Constrains the smart defaults to work within specified bounds
+  const speakerOptions: SpeakerOptions = {
+    min_speakers_expected: 2, // At least 2 speakers (overrides smart default if < 2)
+    max_speakers_expected: 4, // At most 4 speakers (overrides smart default if > 4)
+  };
+
+  transcript = await client.transcripts.transcribe({
+    audio: "https://example.com/audio.mp3",
+    speaker_labels: true,
+    speaker_options: speakerOptions,
+  });
+
+  console.log("With speaker options:", transcript.id);
+
+  // Note: The following would be INVALID since speakers_expected and speaker_options are mutually exclusive:
+  // transcript = await client.transcripts.transcribe({
+  //   audio: "https://example.com/audio.mp3",
+  //   speaker_labels: true,
+  //   speakers_expected: 3, // ❌ Cannot use both
+  //   speaker_options: { min_speakers_expected: 2 }, // ❌ Cannot use both
+  // });
+
+  // Example 4: Edge case handling for challenging audio
+  // Use speaker_options when you need precise control over speaker detection
+  transcript = await client.transcripts.transcribe({
+    audio: "https://example.com/audio.mp3",
+    speaker_labels: true,
+    speaker_options: {
+      min_speakers_expected: 1, // Handle solo speakers or presentations
+      max_speakers_expected: 10, // Handle large meetings or conferences
+    },
+  });
+
+  console.log("Edge case handling:", transcript.id);
+
+  // Access the utterances with speaker labels
+  if (transcript.status === "completed" && transcript.utterances) {
+    for (const utterance of transcript.utterances) {
+      console.log(`Speaker ${utterance.speaker}: ${utterance.text}`);
+    }
+  }
+}
+
+// Run the example
+transcribeWithSpeakerDiarization().catch(console.error);
diff --git a/src/types/openapi.generated.ts b/src/types/openapi.generated.ts
index 29174b5..27423a5 100644
--- a/src/types/openapi.generated.ts
+++ b/src/types/openapi.generated.ts
@@ -1413,6 +1413,20 @@ export type SeverityScoreSummary = {
  medium: number;
};
+/**
+ * Advanced options for controlling speaker diarization parameters
+ */
+export type SpeakerOptions = {
+  /**
+   * Minimum number of speakers expected in the audio
+   */
+  min_speakers_expected?: number | null;
+  /**
+   * Maximum number of speakers expected in the audio
+   */
+  max_speakers_expected?: number | null;
+};
+
/**
 * The speech model to use for the transcription.
 */
@@ -2517,6 +2531,10 @@ export type Transcript = {
   * Tell the speaker label model how many speakers it should attempt to identify, up to 10. See {@link https://www.assemblyai.com/docs/models/speaker-diarization | Speaker diarization } for more details.
   */
  speakers_expected?: number | null;
+  /**
+   * Advanced options for controlling speaker diarization parameters
+   */
+  speaker_options?: SpeakerOptions | null;
  /**
   * The speech model used for the transcription. When `null`, the default model is used.
   * @defaultValue "null
@@ -3039,6 +3057,10 @@ export type TranscriptOptionalParams = {
   * @defaultValue "null
   */
  speakers_expected?: number | null;
+  /**
+   * Advanced options for controlling speaker diarization parameters
+   */
+  speaker_options?: SpeakerOptions | null;
  /**
   * The speech model to use for the transcription. When `null`, the "best" model is used.
   * @defaultValue best
diff --git a/tests/unit/speaker-options.test.ts b/tests/unit/speaker-options.test.ts
new file mode 100644
index 0000000..be49c11
--- /dev/null
+++ b/tests/unit/speaker-options.test.ts
@@ -0,0 +1,130 @@
+import fetchMock from "jest-fetch-mock";
+import { SpeakerOptions } from "../../src";
+import {
+  createClient,
+  requestMatches,
+} from "./utils";
+
+fetchMock.enableMocks();
+
+const assembly = createClient();
+const transcriptId = "transcript_123";
+const remoteAudioURL = "https://assembly.ai/espn.m4a";
+
+beforeEach(() => {
+  jest.clearAllMocks();
+  fetchMock.resetMocks();
+  fetchMock.doMock();
+});
+
+describe("speaker options", () => {
+  it("should create transcript with speaker_options", async () => {
+    const speakerOptions: SpeakerOptions = {
+      min_speakers_expected: 2,
+      max_speakers_expected: 4,
+    };
+
+    fetchMock.doMockOnceIf(
+      requestMatches({ url: "/v2/transcript", method: "POST" }),
+      JSON.stringify({ id: transcriptId, status: "queued" }),
+    );
+
+    const transcript = await assembly.transcripts.submit({
+      audio_url: remoteAudioURL,
+      speaker_labels: true,
+      speaker_options: speakerOptions,
+    });
+
+    expect(transcript.id).toBe(transcriptId);
+    expect(transcript.status).toBe("queued");
+
+    // Verify the request body included speaker_options
+    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
+    expect(requestBody.speaker_labels).toBe(true);
+    expect(requestBody.speaker_options).toEqual(speakerOptions);
+  });
+
+  it("should create transcript with only min_speakers_expected", async () => {
+    const speakerOptions: SpeakerOptions = {
+      min_speakers_expected: 3,
+    };
+
+    fetchMock.doMockOnceIf(
+      requestMatches({ url: "/v2/transcript", method: "POST" }),
+      JSON.stringify({ id: transcriptId, status: "queued" }),
+    );
+
+    const transcript = await assembly.transcripts.submit({
+      audio_url: remoteAudioURL,
+      speaker_labels: true,
+      speaker_options: speakerOptions,
+    });
+
+    expect(transcript.id).toBe(transcriptId);
+
+    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
+    expect(requestBody.speaker_options.min_speakers_expected).toBe(3);
+    expect(requestBody.speaker_options.max_speakers_expected).toBeUndefined();
+  });
+
+  it("should create transcript with only max_speakers_expected", async () => {
+    const speakerOptions: SpeakerOptions = {
+      max_speakers_expected: 5,
+    };
+
+    fetchMock.doMockOnceIf(
+      requestMatches({ url: "/v2/transcript", method: "POST" }),
+      JSON.stringify({ id: transcriptId, status: "queued" }),
+    );
+
+    const transcript = await assembly.transcripts.submit({
+      audio_url: remoteAudioURL,
+      speaker_labels: true,
+      speaker_options: speakerOptions,
+    });
+
+    expect(transcript.id).toBe(transcriptId);
+
+    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
+    expect(requestBody.speaker_options.min_speakers_expected).toBeUndefined();
+    expect(requestBody.speaker_options.max_speakers_expected).toBe(5);
+  });
+
+  it("should create transcript with speakers_expected (without speaker_options)", async () => {
+    fetchMock.doMockOnceIf(
+      requestMatches({ url: "/v2/transcript", method: "POST" }),
+      JSON.stringify({ id: transcriptId, status: "queued" }),
+    );
+
+    const transcript = await assembly.transcripts.submit({
+      audio_url: remoteAudioURL,
+      speaker_labels: true,
+      speakers_expected: 3,
+    });
+
+    expect(transcript.id).toBe(transcriptId);
+
+    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
+    expect(requestBody.speaker_labels).toBe(true);
+    expect(requestBody.speakers_expected).toBe(3);
+    expect(requestBody.speaker_options).toBeUndefined();
+  });
+
+  it("should handle null speaker_options", async () => {
+    fetchMock.doMockOnceIf(
+      requestMatches({ url: "/v2/transcript", method: "POST" }),
+      JSON.stringify({ id: transcriptId, status: "queued" }),
+    );
+
+    const transcript = await assembly.transcripts.submit({
+      audio_url: remoteAudioURL,
+      speaker_labels: true,
+      speaker_options: null,
+    });
+
+    expect(transcript.id).toBe(transcriptId);
+
+    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
+    expect(requestBody.speaker_options).toBe(null);
+  });
+});
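For quick reference, a minimal sketch of how the `speaker_options` parameter introduced above is used, drawn from `samples/speaker-diarization.ts` and the new `SpeakerOptions` type; the API key and audio URL are placeholders.

```typescript
import { AssemblyAI, SpeakerOptions } from "assemblyai";

// Placeholder API key; replace with your own.
const client = new AssemblyAI({ apiKey: "YOUR_API_KEY" });

// Bound the number of speakers the diarization model may identify.
const speakerOptions: SpeakerOptions = {
  min_speakers_expected: 2,
  max_speakers_expected: 4,
};

// speaker_options is mutually exclusive with speakers_expected:
// pass bounds here, or pass speakers_expected, but not both.
const transcript = await client.transcripts.transcribe({
  audio: "https://example.com/audio.mp3", // placeholder audio URL
  speaker_labels: true,
  speaker_options: speakerOptions,
});

console.log(transcript.status);
```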