7
7
}
8
8
9
9
10
def do_stt(audio, text_state=""):
    """Transcribe a recorded clip with Whisper and append it to the running text.

    Args:
        audio: ``(sample_rate, samples)`` pair, or ``None`` when nothing has
            been recorded. ``samples`` is assumed to be a NumPy integer array
            as produced by the ``gr.Audio`` component -- TODO confirm against
            the Gradio version in use. Raw ``bytes`` are also accepted and are
            treated as 4-byte samples, matching the previous behaviour.
        text_state: Text accumulated from earlier transcriptions.

    Returns:
        ``(text_state, text_state)``: the updated accumulated text, once for
        each of the two UI outputs bound to this callback.
    """
    # No recording yet: indexing None would raise TypeError, so hand the
    # accumulated text back unchanged.
    if audio is None:
        return text_state, text_state

    transcription = ""
    r = sr.Recognizer()

    sample_rate, samples = audio
    if isinstance(samples, (bytes, bytearray)):
        # Already raw PCM; keep the sample width the code used before.
        frame_data = bytes(samples)
        sample_width = 4
    else:
        # AudioData describes a single channel, so average the channels of a
        # (samples, channels) array before serialising it.
        if getattr(samples, "ndim", 1) > 1:
            samples = samples.mean(axis=1).astype(samples.dtype)
        frame_data = samples.tobytes()
        # Derive the width from the array instead of hard-coding 4, which
        # would misread 16-bit samples as 32-bit ones.
        sample_width = samples.dtype.itemsize

    # Convert to AudioData
    audio_data = sr.AudioData(
        sample_rate=sample_rate,
        frame_data=frame_data,
        sample_width=sample_width,
    )

    try:
        transcription = r.recognize_whisper(audio_data, language="english", model="base.en")
    except sr.UnknownValueError:
        print("Whisper could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Whisper", e)

    input_hijack.update({"state": True, "value": [transcription, transcription]})

    text_state += transcription + " "
    return text_state, text_state
26
28
27
29
28
30
def update_hijack (val ):
@@ -31,7 +33,12 @@ def update_hijack(val):
31
33
32
34
33
35
def ui():
    """Create the speech-to-text widgets and connect their event handlers.

    Builds a state holder for the accumulated transcript, an editable preview
    textbox, and a row holding a microphone recorder plus a "Transcribe"
    button. Editing the textbox calls ``update_hijack``; clicking the button
    calls ``do_stt`` with the recording and the stored transcript, and writes
    its two results to the textbox and the state.
    """
    transcript_state = gr.State(value="")

    preview_box = gr.Textbox(
        label="STT-Input",
        placeholder="Speech Preview. Click \" Generate\" to send",
        interactive=True,
    )
    preview_box.change(
        fn=update_hijack,
        inputs=[preview_box],
        outputs=[transcript_state],
    )

    with gr.Row():
        mic_input = gr.Audio(source="microphone")
        transcribe_button = gr.Button(value="Transcribe")

    transcribe_button.click(
        do_stt,
        inputs=[mic_input, transcript_state],
        outputs=[preview_box, transcript_state],
    )
0 commit comments