1111from agora_realtime_ai_api .rtc import Channel , ChatMessage , RtcEngine , RtcOptions
1212
1313from .logger import setup_logger
14- from .realtime .struct import InputAudioBufferCommitted , InputAudioBufferSpeechStarted , InputAudioBufferSpeechStopped , ItemCreated , RateLimitsUpdated , ResponseAudioDelta , ResponseAudioDone , ResponseAudioTranscriptDelta , ResponseAudioTranscriptDone , ResponseContentPartAdded , ResponseContentPartDone , ResponseCreated , ResponseDone , ResponseOutputItemAdded , ResponseOutputItemDone , ServerVADUpdateParams , SessionUpdate , SessionUpdateParams , SessionUpdated , Voices , to_json
14+ from .realtime .struct import InputAudioBufferCommitted , InputAudioBufferSpeechStarted , InputAudioBufferSpeechStopped , InputAudioTranscription , ItemCreated , ItemInputAudioTranscriptionCompleted , RateLimitsUpdated , ResponseAudioDelta , ResponseAudioDone , ResponseAudioTranscriptDelta , ResponseAudioTranscriptDone , ResponseContentPartAdded , ResponseContentPartDone , ResponseCreated , ResponseDone , ResponseOutputItemAdded , ResponseOutputItemDone , ServerVADUpdateParams , SessionUpdate , SessionUpdateParams , SessionUpdated , Voices , to_json
1515from .realtime .connection import RealtimeApiConnection
1616from .tools import ClientToolCallResponse , ToolContext
1717from .utils import PCMWriter
@@ -102,6 +102,7 @@ async def setup_and_run_agent(
102102 modalities = ["text" , "audio" ],
103103 temperature = 0.8 ,
104104 max_response_output_tokens = "inf" ,
105+ input_audio_transcription = InputAudioTranscription (model = "whisper-1" )
105106 )
106107 )
107108 )
@@ -190,7 +191,7 @@ def callback(agora_rtc_conn: RTCConnection, conn_info: RTCConnInfo, reason):
190191 raise
191192
192193 async def rtc_to_model (self ) -> None :
193- if self .subscribe_user is None :
194+ while self .subscribe_user is None or self . channel . get_audio_frames ( self . subscribe_user ) is None :
194195 await asyncio .sleep (0.1 )
195196
196197 audio_frames = self .channel .get_audio_frames (self .subscribe_user )
@@ -242,7 +243,7 @@ async def _process_model_messages(self) -> None:
242243 # logger.info("Received audio message")
243244 self .audio_queue .put_nowait (base64 .b64decode (message .delta ))
244245 # loop.call_soon_threadsafe(self.audio_queue.put_nowait, base64.b64decode(message.delta))
245- logger .info (f"TMS:ResponseAudioDelta: response_id:{ message .response_id } ,item_id: { message .item_id } " )
246+ logger .debug (f"TMS:ResponseAudioDelta: response_id:{ message .response_id } ,item_id: { message .item_id } " )
246247 case ResponseAudioTranscriptDelta ():
247248 # logger.info(f"Received text message {message=}")
248249 asyncio .create_task (self .channel .chat .send_message (
@@ -267,6 +268,13 @@ async def _process_model_messages(self) -> None:
267268 case InputAudioBufferSpeechStopped ():
268269 logger .info (f"TMS:InputAudioBufferSpeechStopped: item_id: { message .item_id } " )
269270 pass
271+ case ItemInputAudioTranscriptionCompleted ():
272+ logger .info (f"ItemInputAudioTranscriptionCompleted: { message = } " )
273+ asyncio .create_task (self .channel .chat .send_message (
274+ ChatMessage (
275+ message = to_json (message ), msg_id = message .item_id
276+ )
277+ ))
270278 # InputAudioBufferCommitted
271279 case InputAudioBufferCommitted ():
272280 pass
0 commit comments