13 changes: 6 additions & 7 deletions README.md
@@ -1,6 +1,6 @@
<img src="https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/assemblyai.png?raw=true" width="500"/>

---
______________________________________________________________________

[![npm](https://img.shields.io/npm/v/assemblyai)](https://www.npmjs.com/package/assemblyai)
[![Test](https://github.com/AssemblyAI/assemblyai-node-sdk/actions/workflows/test.yml/badge.svg)](https://github.com/AssemblyAI/assemblyai-node-sdk/actions/workflows/test.yml)
@@ -101,7 +101,7 @@ let transcript = await client.transcripts.transcribe({
});
```

> [!TIP]
> [!NOTE]
> You can also pass a local file path, a stream, or a buffer as the `audio` property.

`transcribe` queues a transcription job and polls it until the `status` is `completed` or `error`.
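To make that note concrete, here is a minimal sketch (not part of this diff) that passes a local file path as `audio`; the path `./audio.mp3` is a made-up placeholder:

```typescript
import { AssemblyAI } from "assemblyai";

const client = new AssemblyAI({ apiKey: "YOUR_API_KEY" });

// "./audio.mp3" is a hypothetical local path; per the note above,
// a stream or a Buffer can be passed the same way.
const transcript = await client.transcripts.transcribe({
  audio: "./audio.mp3",
});
console.log(transcript.status); // "completed" or "error" once polling finishes
```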
@@ -128,7 +128,7 @@ let transcript = await client.transcripts.transcribe({
});
```

> [!TIP]
> [!NOTE]
> You can also pass a file URL, a stream, or a buffer as the `audio` property.

`transcribe` queues a transcription job and polls it until the `status` is `completed` or `error`.
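Along the same lines, a hedged sketch (not part of this diff) of passing a stream as `audio`, assuming a local `./meeting.m4a` file exists:

```typescript
import fs from "fs";
import { AssemblyAI } from "assemblyai";

const client = new AssemblyAI({ apiKey: "YOUR_API_KEY" });

// fs.createReadStream is one way to supply a stream; a Buffer read from
// disk would work the same way per the note above.
const transcript = await client.transcripts.transcribe({
  audio: fs.createReadStream("./meeting.m4a"),
});
```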
@@ -224,7 +224,7 @@ do {
} while (previousPageUrl !== null);
```

> [!TIP]
> [!NOTE]
> To paginate over all pages, you need to use the `page.page_details.prev_url`
> because the transcripts are returned in descending order by creation date and time.
> The first page contains the most recent transcripts, and each "previous" page contains older transcripts.
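A minimal sketch of the loop this note describes (reconstructed here for clarity; passing a page URL directly to `list` is an assumption based on the surrounding README example):

```typescript
import { AssemblyAI } from "assemblyai";

const client = new AssemblyAI({ apiKey: "YOUR_API_KEY" });

// Walk pages from newest to oldest by following page_details.prev_url.
let previousPageUrl: string | null = null;
do {
  const page = await client.transcripts.list(previousPageUrl ?? undefined);
  for (const transcript of page.transcripts) {
    console.log(transcript.id, transcript.status);
  }
  previousPageUrl = page.page_details.prev_url;
} while (previousPageUrl !== null);
```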
@@ -263,9 +263,7 @@ const rt = client.streaming.transcriber({
> _Server code_:
>
> ```typescript
> const token = await client.streaming.createTemporaryToken({
> expires_in_seconds = 60,
> });
> const token = await client.streaming.createTemporaryToken({ expires_in_seconds: 60 });
> // TODO: return token to client
> ```
>
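To make the `// TODO: return token to client` step concrete, a hedged sketch of a server endpoint, assuming Express (neither the endpoint nor Express is part of this diff):

```typescript
import express from "express";
import { AssemblyAI } from "assemblyai";

const app = express();
const client = new AssemblyAI({ apiKey: "YOUR_API_KEY" });

// Hypothetical endpoint the browser calls to fetch a short-lived streaming token.
app.get("/streaming-token", async (_req, res) => {
  const token = await client.streaming.createTemporaryToken({ expires_in_seconds: 60 });
  res.json({ token });
});

app.listen(3000);
```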
@@ -283,6 +281,7 @@ const rt = client.streaming.transcriber({
You can configure the following events.

<!-- prettier-ignore -->

```typescript
rt.on("open", ({ id, expires_at }) => console.log('Session ID:', id, 'Expires at:', expires_at));
rt.on("close", (code: number, reason: string) => console.log('Closed', code, reason));
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "assemblyai",
"version": "4.13.2",
"version": "4.13.3",
"description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
"engines": {
"node": ">=18"
80 changes: 80 additions & 0 deletions samples/speaker-diarization.ts
@@ -0,0 +1,80 @@
/**
* Example of using speaker diarization with speaker_options
*
* Note: speaker_options and speakers_expected are mutually exclusive.
* Use either speakers_expected for simple guidance OR speaker_options for advanced control.
*/

import { AssemblyAI, SpeakerOptions } from "assemblyai";

// Replace with your API key
const client = new AssemblyAI({
apiKey: "YOUR_API_KEY",
});

async function transcribeWithSpeakerDiarization() {
// Example 1: Basic speaker diarization (uses smart defaults)
// The model automatically detects the optimal number of speakers
let transcript = await client.transcripts.transcribe({
audio: "https://example.com/audio.mp3",
speaker_labels: true,
});

console.log("Basic speaker diarization:", transcript.id);

// Example 2: Provide a hint with speakers_expected (smart default with guidance)
// Still uses smart defaults but gives the model a hint about expected speakers
transcript = await client.transcripts.transcribe({
audio: "https://example.com/audio.mp3",
speaker_labels: true,
speakers_expected: 3,
});

console.log("With expected speakers:", transcript.id);

// Example 3: Set boundaries with speaker_options (controlled smart defaults)
// Constrains the smart defaults to work within specified bounds
const speakerOptions: SpeakerOptions = {
min_speakers_expected: 2, // At least 2 speakers (overrides smart default if < 2)
max_speakers_expected: 4, // At most 4 speakers (overrides smart default if > 4)
};

transcript = await client.transcripts.transcribe({
audio: "https://example.com/audio.mp3",
speaker_labels: true,
speaker_options: speakerOptions,
});

console.log("With speaker options:", transcript.id);

// Note: The following would be INVALID since speakers_expected and speaker_options are mutually exclusive:
// transcript = await client.transcripts.transcribe({
// audio: "https://example.com/audio.mp3",
// speaker_labels: true,
// speakers_expected: 3, // ❌ Cannot use both
// speaker_options: { min_speakers_expected: 2 }, // ❌ Cannot use both
// });

// Example 4: Edge case handling for challenging audio
// Use speaker_options when you need precise control over speaker detection
transcript = await client.transcripts.transcribe({
audio: "https://example.com/audio.mp3",
speaker_labels: true,
speaker_options: {
min_speakers_expected: 1, // Handle solo speakers or presentations
max_speakers_expected: 10, // Handle large meetings or conferences
},
});

console.log("Edge case handling:", transcript.id);

// Access the utterances with speaker labels
if (transcript.status === "completed" && transcript.utterances) {
for (const utterance of transcript.utterances) {
console.log(`Speaker ${utterance.speaker}: ${utterance.text}`);
}
}
}

// Run the example
transcribeWithSpeakerDiarization().catch(console.error);
22 changes: 22 additions & 0 deletions src/types/openapi.generated.ts
@@ -969,7 +969,7 @@
/**
* Only get throttled transcripts, overrides the status filter
* @defaultValue false
* @deprecated

Check warning on line 972 in src/types/openapi.generated.ts (GitHub Actions, Node.js 18 and 20 on ubuntu-latest): tsdoc-missing-deprecation-message: The @deprecated block must include a deprecation message, e.g. describing the recommended alternative
*/
throttled_only?: boolean;
};
@@ -1413,6 +1413,20 @@
medium: number;
};

/**
* Advanced options for controlling speaker diarization parameters
*/
export type SpeakerOptions = {
/**
* Minimum number of speakers expected in the audio
*/
min_speakers_expected?: number | null;
/**
* Maximum number of speakers expected in the audio
*/
max_speakers_expected?: number | null;
};

/**
* The speech model to use for the transcription.
*/
@@ -2517,6 +2531,10 @@
* Tell the speaker label model how many speakers it should attempt to identify, up to 10. See {@link https://www.assemblyai.com/docs/models/speaker-diarization | Speaker diarization } for more details.
*/
speakers_expected?: number | null;
/**
* Advanced options for controlling speaker diarization parameters
*/
speaker_options?: SpeakerOptions | null;
/**
* The speech model used for the transcription. When `null`, the default model is used.
* @defaultValue "null
@@ -3039,6 +3057,10 @@
* @defaultValue "null
*/
speakers_expected?: number | null;
/**
* Advanced options for controlling speaker diarization parameters
*/
speaker_options?: SpeakerOptions | null;
/**
* The speech model to use for the transcription. When `null`, the "best" model is used.
* @defaultValue best
130 changes: 130 additions & 0 deletions tests/unit/speaker-options.test.ts
@@ -0,0 +1,130 @@
import fetchMock from "jest-fetch-mock";
import { SpeakerOptions } from "../../src";
import {
createClient,
requestMatches,
} from "./utils";

fetchMock.enableMocks();

const assembly = createClient();
const transcriptId = "transcript_123";
const remoteAudioURL = "https://assembly.ai/espn.m4a";

beforeEach(() => {
jest.clearAllMocks();
fetchMock.resetMocks();
fetchMock.doMock();
});

describe("speaker options", () => {
it("should create transcript with speaker_options", async () => {
const speakerOptions: SpeakerOptions = {
min_speakers_expected: 2,
max_speakers_expected: 4,
};

fetchMock.doMockOnceIf(
requestMatches({ url: "/v2/transcript", method: "POST" }),
JSON.stringify({ id: transcriptId, status: "queued" }),
);

const transcript = await assembly.transcripts.submit({
audio_url: remoteAudioURL,
speaker_labels: true,
speaker_options: speakerOptions,
});

expect(transcript.id).toBe(transcriptId);
expect(transcript.status).toBe("queued");

// Verify the request body included speaker_options
const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
expect(requestBody.speaker_labels).toBe(true);
expect(requestBody.speaker_options).toEqual(speakerOptions);
});

it("should create transcript with only min_speakers_expected", async () => {
const speakerOptions: SpeakerOptions = {
min_speakers_expected: 3,
};

fetchMock.doMockOnceIf(
requestMatches({ url: "/v2/transcript", method: "POST" }),
JSON.stringify({ id: transcriptId, status: "queued" }),
);

const transcript = await assembly.transcripts.submit({
audio_url: remoteAudioURL,
speaker_labels: true,
speaker_options: speakerOptions,
});

expect(transcript.id).toBe(transcriptId);

const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
expect(requestBody.speaker_options.min_speakers_expected).toBe(3);
expect(requestBody.speaker_options.max_speakers_expected).toBeUndefined();
});

it("should create transcript with only max_speakers_expected", async () => {
const speakerOptions: SpeakerOptions = {
max_speakers_expected: 5,
};

fetchMock.doMockOnceIf(
requestMatches({ url: "/v2/transcript", method: "POST" }),
JSON.stringify({ id: transcriptId, status: "queued" }),
);

const transcript = await assembly.transcripts.submit({
audio_url: remoteAudioURL,
speaker_labels: true,
speaker_options: speakerOptions,
});

expect(transcript.id).toBe(transcriptId);

const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
expect(requestBody.speaker_options.min_speakers_expected).toBeUndefined();
expect(requestBody.speaker_options.max_speakers_expected).toBe(5);
});

it("should create transcript with speakers_expected (without speaker_options)", async () => {
fetchMock.doMockOnceIf(
requestMatches({ url: "/v2/transcript", method: "POST" }),
JSON.stringify({ id: transcriptId, status: "queued" }),
);

const transcript = await assembly.transcripts.submit({
audio_url: remoteAudioURL,
speaker_labels: true,
speakers_expected: 3,
});

expect(transcript.id).toBe(transcriptId);

const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
expect(requestBody.speaker_labels).toBe(true);
expect(requestBody.speakers_expected).toBe(3);
expect(requestBody.speaker_options).toBeUndefined();
});

it("should handle null speaker_options", async () => {
fetchMock.doMockOnceIf(
requestMatches({ url: "/v2/transcript", method: "POST" }),
JSON.stringify({ id: transcriptId, status: "queued" }),
);

const transcript = await assembly.transcripts.submit({
audio_url: remoteAudioURL,
speaker_labels: true,
speaker_options: null,
});

expect(transcript.id).toBe(transcriptId);

const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
expect(requestBody.speaker_options).toBe(null);
});
});