Skip to content

Commit 0857788

Browse files
authored
fix(provider/groq): experimental_transcribe fails with valid Buffer (#8159)
## Background

`experimental_transcribe` fails with valid Buffer input after upgrading to `ai@5`.

## Summary

[Set `filename`](https://developer.mozilla.org/en-US/docs/Web/API/FormData/append#filename) when uploading an audio blob to Groq's transcribe API.

## Manual Verification

- https://github.com/firstloophq/ai-sdk-audio-bug-repro

## Tasks

<!-- This task list is intended to help you keep track of what you need to do. Feel free to add tasks and remove unnecessary tasks or this section as needed. Please check if the PR fulfills the following requirements: -->

- [x] Tests have been added / updated (for bug fixes / features)
- [x] Documentation has been added / updated (for bug fixes / features)
- [x] A _patch_ changeset for relevant packages has been added (for bug fixes / features - run `pnpm changeset` in the project root)
- [x] Formatting issues have been fixed (run `pnpm prettier-fix` in the project root)

## Future Work

Apply the same fix to other places where we upload a blob without defining a file name:

```
packages/elevenlabs/src/elevenlabs-transcription-model.ts:
  75      formData.append('model_id', this.modelId);
  76:     formData.append('file', new File([blob], 'audio', { type: mediaType }));
  77      formData.append('diarize', 'true');

packages/gladia/src/gladia-transcription-model.ts:
  502       'audio',
  503:      new File([blob], 'audio', { type: options.mediaType }),
  504     );

packages/openai/src/transcription/openai-transcription-model.ts:
  131     formData.append('model', this.modelId);
  132:    formData.append('file', new File([blob], 'audio', { type: mediaType }));
  133

packages/revai/src/revai-transcription-model.ts:
  254
  255:    formData.append('media', new File([blob], 'audio', { type: mediaType }));
  256     const transcriptionModelOptions: RevaiTranscriptionAPITypes = {
```

## Related Issues

Closes #6413
Closes #7757
1 parent 8473978 commit 0857788

File tree

6 files changed

+89
-20
lines changed

6 files changed

+89
-20
lines changed

.changeset/smooth-impalas-remain.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
'@ai-sdk/provider-utils': patch
3+
'@ai-sdk/groq': patch
4+
---
5+
6+
fix(provider/groq): `experimental_transcribe` fails with valid Buffer

examples/ai-core/src/transcribe/groq-string.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ async function main() {
77
const result = await transcribe({
88
model: groq.transcription('whisper-large-v3-turbo'),
99
audio: Buffer.from(await readFile('./data/galileo.mp3')).toString('base64'),
10+
providerOptions: {
11+
groq: {
12+
responseFormat: 'verbose_json',
13+
},
14+
},
1015
});
1116

1217
console.log('Text:', result.text);

packages/groq/src/groq-transcription-model.ts

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {
66
combineHeaders,
77
convertBase64ToUint8Array,
88
createJsonResponseHandler,
9+
mediaTypeToExtension,
910
parseProviderOptions,
1011
postFormDataToApi,
1112
} from '@ai-sdk/provider-utils';
@@ -68,7 +69,12 @@ export class GroqTranscriptionModel implements TranscriptionModelV2 {
6869
: new Blob([convertBase64ToUint8Array(audio)]);
6970

7071
formData.append('model', this.modelId);
71-
formData.append('file', new File([blob], 'audio', { type: mediaType }));
72+
const fileExtension = mediaTypeToExtension(mediaType);
73+
formData.append(
74+
'file',
75+
new File([blob], 'audio', { type: mediaType }),
76+
`audio.${fileExtension}`,
77+
);
7278

7379
// Add provider-specific options
7480
if (groqOptions) {
@@ -134,8 +140,8 @@ export class GroqTranscriptionModel implements TranscriptionModelV2 {
134140
startSecond: segment.start,
135141
endSecond: segment.end,
136142
})) ?? [],
137-
language: response.language,
138-
durationInSeconds: response.duration,
143+
language: response.language ?? undefined,
144+
durationInSeconds: response.duration ?? undefined,
139145
warnings,
140146
response: {
141147
timestamp: currentDate,
@@ -148,25 +154,28 @@ export class GroqTranscriptionModel implements TranscriptionModelV2 {
148154
}
149155

150156
const groqTranscriptionResponseSchema = z.object({
151-
task: z.string(),
152-
language: z.string(),
153-
duration: z.number(),
154157
text: z.string(),
155-
segments: z.array(
156-
z.object({
157-
id: z.number(),
158-
seek: z.number(),
159-
start: z.number(),
160-
end: z.number(),
161-
text: z.string(),
162-
tokens: z.array(z.number()),
163-
temperature: z.number(),
164-
avg_logprob: z.number(),
165-
compression_ratio: z.number(),
166-
no_speech_prob: z.number(),
167-
}),
168-
),
169158
x_groq: z.object({
170159
id: z.string(),
171160
}),
161+
// additional properties are only returned when `response_format: 'verbose_json'` is set
162+
task: z.string().nullish(),
163+
language: z.string().nullish(),
164+
duration: z.number().nullish(),
165+
segments: z
166+
.array(
167+
z.object({
168+
id: z.number(),
169+
seek: z.number(),
170+
start: z.number(),
171+
end: z.number(),
172+
text: z.string(),
173+
tokens: z.array(z.number()),
174+
temperature: z.number(),
175+
avg_logprob: z.number(),
176+
compression_ratio: z.number(),
177+
no_speech_prob: z.number(),
178+
}),
179+
)
180+
.nullish(),
172181
});

packages/provider-utils/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ export { isUrlSupported } from './is-url-supported';
1212
export * from './load-api-key';
1313
export { loadOptionalSetting } from './load-optional-setting';
1414
export { loadSetting } from './load-setting';
15+
export { mediaTypeToExtension } from './media-type-to-extension';
1516
export * from './parse-json';
1617
export { parseJsonEventStream } from './parse-json-event-stream';
1718
export { parseProviderOptions } from './parse-provider-options';
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import { describe, it, expect } from 'vitest';
2+
import { mediaTypeToExtension } from './media-type-to-extension';
3+
4+
describe('mediaTypeToExtension()', () => {
5+
it.each([
6+
// most common
7+
['audio/mpeg', 'mp3'],
8+
['audio/mp3', 'mp3'],
9+
['audio/wav', 'wav'],
10+
['audio/x-wav', 'wav'],
11+
['audio/webm', 'webm'],
12+
['audio/ogg', 'ogg'],
13+
['audio/opus', 'ogg'],
14+
['audio/mp4', 'm4a'],
15+
['audio/x-m4a', 'm4a'],
16+
['audio/flac', 'flac'],
17+
['audio/aac', 'aac'],
18+
// upper case
19+
['AUDIO/MPEG', 'mp3'],
20+
['AUDIO/MP3', 'mp3'],
21+
// invalid
22+
['nope', ''],
23+
])('should map %s to %s', (mediaType, expectedExtension) => {
24+
expect(mediaTypeToExtension(mediaType)).toBe(expectedExtension);
25+
});
26+
});
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
/**
 * Subtypes whose conventional file extension differs from the subtype itself.
 * A `Map` is used (rather than an object literal) so that lookups never hit
 * inherited `Object.prototype` members such as `constructor` or `toString`.
 */
const SUBTYPE_EXTENSION_OVERRIDES: ReadonlyMap<string, string> = new Map([
  ['mpeg', 'mp3'],
  ['x-wav', 'wav'],
  ['opus', 'ogg'],
  ['mp4', 'm4a'],
  ['x-m4a', 'm4a'],
]);

/**
 * Maps a media type to its corresponding file extension.
 * It was originally introduced to set a filename for audio file uploads
 * in https://github.com/vercel/ai/pull/8159.
 *
 * The match is case-insensitive. Media type parameters (everything after the
 * first `;`, e.g. `audio/webm;codecs=opus`) and surrounding whitespace are
 * ignored, so they never end up in the returned extension.
 *
 * @param mediaType The media type to map, e.g. `audio/mpeg`.
 * @returns The corresponding file extension without a leading dot. Subtypes
 * without a dedicated mapping are returned as-is (lower-cased); an empty
 * string is returned when the input has no `type/subtype` structure.
 * @see https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/MIME_types/Common_types
 */
export function mediaTypeToExtension(mediaType: string): string {
  // Strip parameters first: only the `type/subtype` part determines the extension.
  const [essence = ''] = mediaType.split(';');

  // The top-level type (e.g. `audio`) is irrelevant for the mapping.
  const [, rawSubtype = ''] = essence.trim().toLowerCase().split('/');
  const subtype = rawSubtype.trim();

  return SUBTYPE_EXTENSION_OVERRIDES.get(subtype) ?? subtype;
}

0 commit comments

Comments
 (0)