Skip to content

Commit 21a9c42

Browse files
authored
Merge cb546f5 into 3bb3f36
2 parents 3bb3f36 + cb546f5 commit 21a9c42

14 files changed

+1069
-10
lines changed

common/api-review/ai.api.md

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@ export class ArraySchema extends Schema {
8585
toJSON(): SchemaRequest;
8686
}
8787

88+
// @beta
89+
export interface AudioConversationController {
90+
stop: () => Promise<void>;
91+
}
92+
8893
// @public
8994
export abstract class Backend {
9095
protected constructor(type: BackendType);
@@ -710,7 +715,7 @@ export interface LiveGenerationConfig {
710715
frequencyPenalty?: number;
711716
maxOutputTokens?: number;
712717
presencePenalty?: number;
713-
responseModalities?: [ResponseModality];
718+
responseModalities?: ResponseModality[];
714719
speechConfig?: SpeechConfig;
715720
temperature?: number;
716721
topK?: number;
@@ -787,6 +792,7 @@ export class LiveSession {
787792
// @internal
788793
constructor(webSocketHandler: WebSocketHandler, serverMessages: AsyncGenerator<unknown>);
789794
close(): Promise<void>;
795+
inConversation: boolean;
790796
isClosed: boolean;
791797
receive(): AsyncGenerator<LiveServerContent | LiveServerToolCall | LiveServerToolCallCancellation>;
792798
send(request: string | Array<string | Part>, turnComplete?: boolean): Promise<void>;
@@ -860,7 +866,7 @@ export const POSSIBLE_ROLES: readonly ["user", "model", "function", "system"];
860866

861867
// @beta
862868
export interface PrebuiltVoiceConfig {
863-
voiceConfig?: string;
869+
voiceName?: string;
864870
}
865871

866872
// @public
@@ -882,6 +888,7 @@ export interface RequestOptions {
882888
export const ResponseModality: {
883889
readonly TEXT: "TEXT";
884890
readonly IMAGE: "IMAGE";
891+
readonly AUDIO: "AUDIO";
885892
};
886893

887894
// @beta
@@ -1031,6 +1038,14 @@ export interface SpeechConfig {
10311038
voiceConfig?: VoiceConfig;
10321039
}
10331040

1041+
// @beta
1042+
export function startAudioConversation(liveSession: LiveSession, options?: StartAudioConversationOptions): Promise<AudioConversationController>;
1043+
1044+
// @beta
1045+
export interface StartAudioConversationOptions {
1046+
functionCallingHandler?: (functionCalls: LiveServerToolCall['functionCalls']) => Promise<Part>;
1047+
}
1048+
10341049
// @public
10351050
export interface StartChatParams extends BaseParams {
10361051
// (undocumented)

docs-devsite/_toc.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ toc:
1616
path: /docs/reference/js/ai.anyofschema.md
1717
- title: ArraySchema
1818
path: /docs/reference/js/ai.arrayschema.md
19+
- title: AudioConversationController
20+
path: /docs/reference/js/ai.audioconversationcontroller.md
1921
- title: Backend
2022
path: /docs/reference/js/ai.backend.md
2123
- title: BaseParams
@@ -160,6 +162,8 @@ toc:
160162
path: /docs/reference/js/ai.segment.md
161163
- title: SpeechConfig
162164
path: /docs/reference/js/ai.speechconfig.md
165+
- title: StartAudioConversationOptions
166+
path: /docs/reference/js/ai.startaudioconversationoptions.md
163167
- title: StartChatParams
164168
path: /docs/reference/js/ai.startchatparams.md
165169
- title: StringSchema
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
Project: /docs/reference/js/_project.yaml
2+
Book: /docs/reference/_book.yaml
3+
page_type: reference
4+
5+
{% comment %}
6+
DO NOT EDIT THIS FILE!
7+
This is generated by the JS SDK team, and any local changes will be
8+
overwritten. Changes should be made in the source code at
9+
https://github.com/firebase/firebase-js-sdk
10+
{% endcomment %}
11+
12+
# AudioConversationController interface
13+
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
14+
>
15+
16+
A controller for managing an active audio conversation.
17+
18+
<b>Signature:</b>
19+
20+
```typescript
21+
export interface AudioConversationController
22+
```
23+
24+
## Properties
25+
26+
| Property | Type | Description |
27+
| --- | --- | --- |
28+
| [stop](./ai.audioconversationcontroller.md#audioconversationcontrollerstop) | () =&gt; Promise&lt;void&gt; | <b><i>(Public Preview)</i></b> Stops the audio conversation, closes the microphone connection, and cleans up resources. Returns a promise that resolves when cleanup is complete. |
29+
30+
## AudioConversationController.stop
31+
32+
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
33+
>
34+
35+
Stops the audio conversation, closes the microphone connection, and cleans up resources. Returns a promise that resolves when cleanup is complete.
36+
37+
<b>Signature:</b>
38+
39+
```typescript
40+
stop: () => Promise<void>;
41+
```

docs-devsite/ai.livegenerationconfig.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ export interface LiveGenerationConfig
2828
| [frequencyPenalty](./ai.livegenerationconfig.md#livegenerationconfigfrequencypenalty) | number | <b><i>(Public Preview)</i></b> Frequency penalties. |
2929
| [maxOutputTokens](./ai.livegenerationconfig.md#livegenerationconfigmaxoutputtokens) | number | <b><i>(Public Preview)</i></b> Specifies the maximum number of tokens that can be generated in the response. The number of tokens per word varies depending on the language outputted. Is unbounded by default. |
3030
| [presencePenalty](./ai.livegenerationconfig.md#livegenerationconfigpresencepenalty) | number | <b><i>(Public Preview)</i></b> Positive penalties. |
31-
| [responseModalities](./ai.livegenerationconfig.md#livegenerationconfigresponsemodalities) | \[[ResponseModality](./ai.md#responsemodality)<!-- -->\] | <b><i>(Public Preview)</i></b> The modalities of the response. |
31+
| [responseModalities](./ai.livegenerationconfig.md#livegenerationconfigresponsemodalities) | [ResponseModality](./ai.md#responsemodality)<!-- -->\[\] | <b><i>(Public Preview)</i></b> The modalities of the response. |
3232
| [speechConfig](./ai.livegenerationconfig.md#livegenerationconfigspeechconfig) | [SpeechConfig](./ai.speechconfig.md#speechconfig_interface) | <b><i>(Public Preview)</i></b> Configuration for speech synthesis. |
3333
| [temperature](./ai.livegenerationconfig.md#livegenerationconfigtemperature) | number | <b><i>(Public Preview)</i></b> Controls the degree of randomness in token selection. A <code>temperature</code> value of 0 means that the highest probability tokens are always selected. In this case, responses for a given prompt are mostly deterministic, but a small amount of variation is still possible. |
3434
| [topK](./ai.livegenerationconfig.md#livegenerationconfigtopk) | number | <b><i>(Public Preview)</i></b> Changes how the model selects tokens for output. A <code>topK</code> value of 1 means the selected token is the most probable among all tokens in the model's vocabulary, while a <code>topK</code> value of 3 means that the next token is selected from among the 3 most probable, using probabilities sampled. Tokens are then further filtered with the highest selected <code>temperature</code> sampling. Defaults to 40 if unspecified. |
@@ -83,7 +83,7 @@ The modalities of the response.
8383
<b>Signature:</b>
8484

8585
```typescript
86-
responseModalities?: [ResponseModality];
86+
responseModalities?: ResponseModality[];
8787
```
8888

8989
## LiveGenerationConfig.speechConfig

docs-devsite/ai.livesession.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ export declare class LiveSession
2929

3030
| Property | Modifiers | Type | Description |
3131
| --- | --- | --- | --- |
32+
| [inConversation](./ai.livesession.md#livesessioninconversation) | | boolean | <b><i>(Public Preview)</i></b> Indicates whether this Live session is being controlled by an <code>AudioConversationController</code>. |
3233
| [isClosed](./ai.livesession.md#livesessionisclosed) | | boolean | <b><i>(Public Preview)</i></b> Indicates whether this Live session is closed. |
3334

3435
## Methods
@@ -41,6 +42,19 @@ export declare class LiveSession
4142
| [sendMediaChunks(mediaChunks)](./ai.livesession.md#livesessionsendmediachunks) | | <b><i>(Public Preview)</i></b> Sends realtime input to the server. |
4243
| [sendMediaStream(mediaChunkStream)](./ai.livesession.md#livesessionsendmediastream) | | <b><i>(Public Preview)</i></b> Sends a stream of [GenerativeContentBlob](./ai.generativecontentblob.md#generativecontentblob_interface)<!-- -->. |
4344

45+
## LiveSession.inConversation
46+
47+
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
48+
>
49+
50+
Indicates whether this Live session is being controlled by an `AudioConversationController`<!-- -->.
51+
52+
<b>Signature:</b>
53+
54+
```typescript
55+
inConversation: boolean;
56+
```
57+
4458
## LiveSession.isClosed
4559

4660
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.

docs-devsite/ai.md

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ The Firebase AI Web SDK.
2222
| [getGenerativeModel(ai, modelParams, requestOptions)](./ai.md#getgenerativemodel_80bd839) | Returns a [GenerativeModel](./ai.generativemodel.md#generativemodel_class) class with methods for inference and other functionality. |
2323
| [getImagenModel(ai, modelParams, requestOptions)](./ai.md#getimagenmodel_e1f6645) | <b><i>(Public Preview)</i></b> Returns an [ImagenModel](./ai.imagenmodel.md#imagenmodel_class) class with methods for using Imagen.<!-- -->Only Imagen 3 models (named <code>imagen-3.0-*</code>) are supported. |
2424
| [getLiveGenerativeModel(ai, modelParams)](./ai.md#getlivegenerativemodel_f2099ac) | <b><i>(Public Preview)</i></b> Returns a [LiveGenerativeModel](./ai.livegenerativemodel.md#livegenerativemodel_class) class for real-time, bidirectional communication.<!-- -->The Live API is only supported in modern browser windows and Node &gt;<!-- -->= 22. |
25+
| <b>function(liveSession, ...)</b> |
26+
| [startAudioConversation(liveSession, options)](./ai.md#startaudioconversation_01c8e7f) | <b><i>(Public Preview)</i></b> Starts a real-time, bidirectional audio conversation with the model. This helper function manages the complexities of microphone access, audio recording, playback, and interruptions. |
2527

2628
## Classes
2729

@@ -53,6 +55,7 @@ The Firebase AI Web SDK.
5355
| --- | --- |
5456
| [AI](./ai.ai.md#ai_interface) | An instance of the Firebase AI SDK.<!-- -->Do not create this instance directly. Instead, use [getAI()](./ai.md#getai_a94a413)<!-- -->. |
5557
| [AIOptions](./ai.aioptions.md#aioptions_interface) | Options for initializing the AI service using [getAI()](./ai.md#getai_a94a413)<!-- -->. This allows specifying which backend to use (Vertex AI Gemini API or Gemini Developer API) and configuring its specific options (like location for Vertex AI). |
58+
| [AudioConversationController](./ai.audioconversationcontroller.md#audioconversationcontroller_interface) | <b><i>(Public Preview)</i></b> A controller for managing an active audio conversation. |
5659
| [BaseParams](./ai.baseparams.md#baseparams_interface) | Base parameters for a number of methods. |
5760
| [Citation](./ai.citation.md#citation_interface) | A single citation. |
5861
| [CitationMetadata](./ai.citationmetadata.md#citationmetadata_interface) | Citation metadata that may be found on a [GenerateContentCandidate](./ai.generatecontentcandidate.md#generatecontentcandidate_interface)<!-- -->. |
@@ -112,6 +115,7 @@ The Firebase AI Web SDK.
112115
| [SearchEntrypoint](./ai.searchentrypoint.md#searchentrypoint_interface) | Google search entry point. |
113116
| [Segment](./ai.segment.md#segment_interface) | Represents a specific segment within a [Content](./ai.content.md#content_interface) object, often used to pinpoint the exact location of text or data that grounding information refers to. |
114117
| [SpeechConfig](./ai.speechconfig.md#speechconfig_interface) | <b><i>(Public Preview)</i></b> Configures speech synthesis. |
118+
| [StartAudioConversationOptions](./ai.startaudioconversationoptions.md#startaudioconversationoptions_interface) | <b><i>(Public Preview)</i></b> Options for [startAudioConversation()](./ai.md#startaudioconversation_01c8e7f)<!-- -->. |
115119
| [StartChatParams](./ai.startchatparams.md#startchatparams_interface) | Params for [GenerativeModel.startChat()](./ai.generativemodel.md#generativemodelstartchat)<!-- -->. |
116120
| [TextPart](./ai.textpart.md#textpart_interface) | Content part interface if the part represents a text string. |
117121
| [ThinkingConfig](./ai.thinkingconfig.md#thinkingconfig_interface) | Configuration for "thinking" behavior of compatible Gemini models.<!-- -->Certain models utilize a thinking process before generating a response. This allows them to reason through complex problems and plan a more coherent and accurate answer. |
@@ -307,6 +311,76 @@ export declare function getLiveGenerativeModel(ai: AI, modelParams: LiveModelPar
307311

308312
If the `apiKey` or `projectId` fields are missing in your Firebase config.
309313

314+
## function(liveSession, ...)
315+
316+
### startAudioConversation(liveSession, options) {:#startaudioconversation_01c8e7f}
317+
318+
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
319+
>
320+
321+
Starts a real-time, bidirectional audio conversation with the model. This helper function manages the complexities of microphone access, audio recording, playback, and interruptions.
322+
323+
Important: This function must be called in response to a user gesture (for example, a button click) to comply with [browser autoplay policies](https://developer.mozilla.org/en-US/docs/Web/API/Web_Audio_API/Best_practices#autoplay_policy)<!-- -->.
324+
325+
<b>Signature:</b>
326+
327+
```typescript
328+
export declare function startAudioConversation(liveSession: LiveSession, options?: StartAudioConversationOptions): Promise<AudioConversationController>;
329+
```
330+
331+
#### Parameters
332+
333+
| Parameter | Type | Description |
334+
| --- | --- | --- |
335+
| liveSession | [LiveSession](./ai.livesession.md#livesession_class) | An active [LiveSession](./ai.livesession.md#livesession_class) instance. |
336+
| options | [StartAudioConversationOptions](./ai.startaudioconversationoptions.md#startaudioconversationoptions_interface) | Configuration options for the audio conversation. |
337+
338+
<b>Returns:</b>
339+
340+
Promise&lt;[AudioConversationController](./ai.audioconversationcontroller.md#audioconversationcontroller_interface)<!-- -->&gt;
341+
342+
A `Promise` that resolves with an [AudioConversationController](./ai.audioconversationcontroller.md#audioconversationcontroller_interface)<!-- -->.
343+
344+
#### Exceptions
345+
346+
`AIError` if the environment does not support required Web APIs (`UNSUPPORTED`<!-- -->), if a conversation is already active (`REQUEST_ERROR`<!-- -->), the session is closed (`SESSION_CLOSED`<!-- -->), or if an unexpected initialization error occurs (`ERROR`<!-- -->).
347+
348+
`DOMException` Thrown by `navigator.mediaDevices.getUserMedia()` if issues occur with microphone access, such as permissions being denied (`NotAllowedError`<!-- -->) or no compatible hardware being found (`NotFoundError`<!-- -->). See the [MDN documentation](https://developer.mozilla.org/en-US/docs/Web/API/MediaDevices/getUserMedia#exceptions) for a full list of exceptions.
349+
350+
### Example
351+
352+
353+
```javascript
354+
const liveSession = await model.connect();
355+
let conversationController;
356+
357+
// This function must be called from within a click handler.
358+
async function startConversation() {
359+
try {
360+
conversationController = await startAudioConversation(liveSession);
361+
} catch (e) {
362+
// Handle AI-specific errors
363+
if (e instanceof AIError) {
364+
console.error("AI Error:", e.message);
365+
}
366+
// Handle microphone permission and hardware errors
367+
else if (e instanceof DOMException) {
368+
console.error("Microphone Error:", e.message);
369+
}
370+
// Handle other unexpected errors
371+
else {
372+
console.error("An unexpected error occurred:", e);
373+
}
374+
}
375+
}
376+
377+
// Later, to stop the conversation:
378+
// if (conversationController) {
379+
// await conversationController.stop();
380+
// }
381+
382+
```
383+
310384
## AIErrorCode
311385

312386
Standardized error codes that [AIError](./ai.aierror.md#aierror_class) can have.
@@ -589,6 +663,7 @@ Generation modalities to be returned in generation responses.
589663
ResponseModality: {
590664
readonly TEXT: "TEXT";
591665
readonly IMAGE: "IMAGE";
666+
readonly AUDIO: "AUDIO";
592667
}
593668
```
594669

docs-devsite/ai.prebuiltvoiceconfig.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ export interface PrebuiltVoiceConfig
2525

2626
| Property | Type | Description |
2727
| --- | --- | --- |
28-
| [voiceConfig](./ai.prebuiltvoiceconfig.md#prebuiltvoiceconfigvoiceconfig) | string | <b><i>(Public Preview)</i></b> The voice name to use for speech synthesis.<!-- -->For a full list of names and demos of what each voice sounds like, see [Chirp 3: HD Voices](https://cloud.google.com/text-to-speech/docs/chirp3-hd)<!-- -->. |
28+
| [voiceName](./ai.prebuiltvoiceconfig.md#prebuiltvoiceconfigvoicename) | string | <b><i>(Public Preview)</i></b> The voice name to use for speech synthesis.<!-- -->For a full list of names and demos of what each voice sounds like, see [Chirp 3: HD Voices](https://cloud.google.com/text-to-speech/docs/chirp3-hd)<!-- -->. |
2929

30-
## PrebuiltVoiceConfig.voiceConfig
30+
## PrebuiltVoiceConfig.voiceName
3131

3232
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
3333
>
@@ -39,5 +39,5 @@ For a full list of names and demos of what each voice sounds like, see [Chirp 3:
3939
<b>Signature:</b>
4040

4141
```typescript
42-
voiceConfig?: string;
42+
voiceName?: string;
4343
```
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
Project: /docs/reference/js/_project.yaml
2+
Book: /docs/reference/_book.yaml
3+
page_type: reference
4+
5+
{% comment %}
6+
DO NOT EDIT THIS FILE!
7+
This is generated by the JS SDK team, and any local changes will be
8+
overwritten. Changes should be made in the source code at
9+
https://github.com/firebase/firebase-js-sdk
10+
{% endcomment %}
11+
12+
# StartAudioConversationOptions interface
13+
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
14+
>
15+
16+
Options for [startAudioConversation()](./ai.md#startaudioconversation_01c8e7f)<!-- -->.
17+
18+
<b>Signature:</b>
19+
20+
```typescript
21+
export interface StartAudioConversationOptions
22+
```
23+
24+
## Properties
25+
26+
| Property | Type | Description |
27+
| --- | --- | --- |
28+
| [functionCallingHandler](./ai.startaudioconversationoptions.md#startaudioconversationoptionsfunctioncallinghandler) | (functionCalls: [LiveServerToolCall](./ai.liveservertoolcall.md#liveservertoolcall_interface)<!-- -->\['functionCalls'\]) =&gt; Promise&lt;[Part](./ai.md#part)<!-- -->&gt; | <b><i>(Public Preview)</i></b> An async handler that is called when the model requests a function to be executed. The handler should perform the function call and return the result as a <code>Part</code>, which will then be sent back to the model. |
29+
30+
## StartAudioConversationOptions.functionCallingHandler
31+
32+
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
33+
>
34+
35+
An async handler that is called when the model requests a function to be executed. The handler should perform the function call and return the result as a `Part`<!-- -->, which will then be sent back to the model.
36+
37+
<b>Signature:</b>
38+
39+
```typescript
40+
functionCallingHandler?: (functionCalls: LiveServerToolCall['functionCalls']) => Promise<Part>;
41+
```

packages/ai/src/api.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ export * from './requests/schema-builder';
4545
export { ImagenImageFormat } from './requests/imagen-image-format';
4646
export { AIModel, GenerativeModel, LiveGenerativeModel, ImagenModel, AIError };
4747
export { Backend, VertexAIBackend, GoogleAIBackend } from './backend';
48+
export {
49+
startAudioConversation,
50+
AudioConversationController,
51+
StartAudioConversationOptions
52+
} from './methods/live-session-helpers';
4853

4954
declare module '@firebase/component' {
5055
interface NameServiceMapping {

0 commit comments

Comments
 (0)