Skip to content

Commit 942de6b

Browse files
authored
feat(generate-transcription): opt-in verbose mode (#159)
* feat(generate-transcription): opt-in verbose mode
* chore(generate-transcription): update test
* chore(generate-transcription): clean snapshot
1 parent 341d2b0 commit 942de6b

File tree

3 files changed

+59
-17
lines changed

3 files changed

+59
-17
lines changed

packages/generate-transcription/src/index.ts

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,42 @@ import type { CommonRequestOptions } from '@xsai/shared'
22

33
import { requestHeaders, requestURL, responseJSON } from '@xsai/shared'
44

5-
export interface GenerateTranscriptionOptions<T extends GenerateTranscriptionOptionsTimeStampGranularities = undefined> extends CommonRequestOptions {
5+
export interface GenerateTranscriptionOptions<
6+
T1 extends GenerateTranscriptionOptionsResponseFormat,
7+
T2 extends GenerateTranscriptionOptionsTimeStampGranularities,
8+
> extends CommonRequestOptions {
69
file: Blob
710
fileName?: string
811
language?: string
912
prompt?: string
13+
/** @default `json` */
14+
responseFormat?: T1
1015
temperature?: string
1116
/** @default `segment` */
12-
timestampGranularities?: T
17+
timestampGranularities?: T2
1318
}
1419

20+
export type GenerateTranscriptionOptionsResponseFormat = 'json' | 'verbose_json' | undefined
21+
1522
export type GenerateTranscriptionOptionsTimeStampGranularities = 'segment' | 'word' | undefined
1623

17-
export interface GenerateTranscriptionResult<T extends GenerateTranscriptionOptionsTimeStampGranularities = undefined> {
18-
duration: number
19-
language: string
20-
segments: T extends 'word' ? never : GenerateTranscriptionResultSegment[]
24+
export interface GenerateTranscriptionResult<
25+
T1 extends GenerateTranscriptionOptionsResponseFormat,
26+
T2 extends GenerateTranscriptionOptionsTimeStampGranularities,
27+
> {
28+
duration: T1 extends 'verbose_json' ? number : never
29+
language: T1 extends 'verbose_json' ? string : never
30+
segments: T1 extends 'verbose_json'
31+
? T2 extends 'word'
32+
? never
33+
: GenerateTranscriptionResultSegment[]
34+
: never
2135
text: string
22-
words: T extends 'word' ? GenerateTranscriptionResultWord[] : never
36+
words: T1 extends 'verbose_json'
37+
? T2 extends 'word'
38+
? GenerateTranscriptionResultWord[]
39+
: never
40+
: never
2341
}
2442

2543
/** @see {@link https://platform.openai.com/docs/api-reference/audio/verbose-json-object#audio/verbose-json-object-segments} */
@@ -43,13 +61,20 @@ export interface GenerateTranscriptionResultWord {
4361
word: string
4462
}
4563

46-
export const generateTranscription = async <T extends GenerateTranscriptionOptionsTimeStampGranularities = undefined>(options: GenerateTranscriptionOptions<T>): Promise<GenerateTranscriptionResult<T>> => {
64+
export const generateTranscription = async <
65+
T1 extends GenerateTranscriptionOptionsResponseFormat = undefined,
66+
T2 extends GenerateTranscriptionOptionsTimeStampGranularities = undefined,
67+
>(options: GenerateTranscriptionOptions<T1, T2>): Promise<GenerateTranscriptionResult<T1, T2>> => {
4768
const body = new FormData()
4869

4970
body.append('model', options.model)
5071
body.append('file', options.file, options.fileName)
51-
body.append('response_format', 'verbose_json')
52-
body.append('timestamp_granularities[]', options.timestampGranularities ?? 'segment')
72+
body.append('response_format', options.responseFormat ?? 'json')
73+
74+
// make ts happy
75+
// eslint-disable-next-line ts/no-unnecessary-type-assertion
76+
if (options.responseFormat as GenerateTranscriptionOptionsResponseFormat === 'verbose_json')
77+
body.append('timestamp_granularities[]', options.timestampGranularities ?? 'segment')
5378

5479
if (options.language != null)
5580
body.append('language', options.language)
@@ -66,5 +91,5 @@ export const generateTranscription = async <T extends GenerateTranscriptionOptio
6691
method: 'POST',
6792
signal: options.abortSignal,
6893
})
69-
.then(responseJSON<GenerateTranscriptionResult<T>>)
94+
.then(responseJSON<GenerateTranscriptionResult<T1, T2>>)
7095
}

packages/generate-transcription/test/__snapshots__/index.test.ts.snap

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
22

3-
exports[`@xsai/generate-transcription > segment 1`] = `
3+
exports[`@xsai/generate-transcription > verbose_json + segment 1`] = `
44
[
55
{
66
"avg_logprob": -0.084375,
@@ -43,7 +43,7 @@ exports[`@xsai/generate-transcription > segment 1`] = `
4343
]
4444
`;
4545

46-
exports[`@xsai/generate-transcription > word 1`] = `
46+
exports[`@xsai/generate-transcription > verbose_json + word 1`] = `
4747
[
4848
{
4949
"end": 0.4,

packages/generate-transcription/test/index.test.ts

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,36 +4,53 @@ import { describe, expect, it } from 'vitest'
44
import { generateTranscription } from '../src'
55

66
describe('@xsai/generate-transcription', () => {
7-
it('segment', async () => {
7+
const expectText = 'Hello, I am your AI assistant. Just let me know how I can help bring your ideas to life.'
8+
9+
it('json', async () => {
10+
const { text } = await generateTranscription({
11+
apiKey: 'a',
12+
baseURL: new URL('http://localhost:8000/v1/'),
13+
file: await openAsBlob('./test/fixtures/basic.wav', { type: 'audio/wav' }),
14+
fileName: 'basic.wav',
15+
language: 'en',
16+
model: 'deepdml/faster-whisper-large-v3-turbo-ct2',
17+
})
18+
19+
expect(text).toBe(expectText)
20+
})
21+
22+
it('verbose_json + segment', async () => {
823
const { duration, language, segments, text } = await generateTranscription({
924
apiKey: 'a',
1025
baseURL: new URL('http://localhost:8000/v1/'),
1126
file: await openAsBlob('./test/fixtures/basic.wav', { type: 'audio/wav' }),
1227
fileName: 'basic.wav',
1328
language: 'en',
1429
model: 'deepdml/faster-whisper-large-v3-turbo-ct2',
30+
responseFormat: 'verbose_json',
1531
})
1632

1733
expect(duration).toBe(5.472)
1834
expect(language).toBe('en')
19-
expect(text).toBe('Hello, I am your AI assistant. Just let me know how I can help bring your ideas to life.')
35+
expect(text).toBe(expectText)
2036
expect(segments).toMatchSnapshot()
2137
}, 30000)
2238

23-
it('word', async () => {
39+
it('verbose_json + word', async () => {
2440
const { duration, language, text, words } = await generateTranscription({
2541
apiKey: 'a',
2642
baseURL: new URL('http://localhost:8000/v1/'),
2743
file: await openAsBlob('./test/fixtures/basic.wav', { type: 'audio/wav' }),
2844
fileName: 'basic.wav',
2945
language: 'en',
3046
model: 'deepdml/faster-whisper-large-v3-turbo-ct2',
47+
responseFormat: 'verbose_json',
3148
timestampGranularities: 'word',
3249
})
3350

3451
expect(duration).toBe(5.472)
3552
expect(language).toBe('en')
36-
expect(text).toBe('Hello, I am your AI assistant. Just let me know how I can help bring your ideas to life.')
53+
expect(text).toBe(expectText)
3754
expect(words).toMatchSnapshot()
3855
}, 30000)
3956
})

0 commit comments

Comments
 (0)