Skip to content

Commit 7da742e

Browse files
authored
Merge pull request oobabooga#207 from EliasVincent/stt-extension
Extension: Whisper Speech-To-Text Input
2 parents ebef4a5 + 02e1113 commit 7da742e

File tree

2 files changed

+58
-0
lines changed

2 files changed

+58
-0
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
git+https://github.com/Uberi/speech_recognition.git@010382b
2+
openai-whisper
3+
soundfile
4+
ffmpeg

extensions/whisper_stt/script.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import gradio as gr
2+
import speech_recognition as sr
3+
4+
# Shared hook polled by the main UI: when 'state' is True, 'value' carries the
# [visible_text, internal_text] pair to inject as the user's next input.
input_hijack = {
    'state': False,
    'value': ["", ""],
}
8+
9+
10+
def do_stt(audio, text_state=""):
    """Transcribe a gradio microphone recording with Whisper.

    ``audio`` is the ``(sample_rate, frames)`` tuple produced by
    ``gr.Audio`` — ``frames`` is assumed to be a numpy integer PCM array
    (int16 for gradio microphone capture; TODO confirm against the gradio
    version in use).  Appends the transcription to ``text_state`` and
    returns it twice so it can feed both the preview textbox and the
    ``gr.State`` holder.  Also flips the module-level ``input_hijack``
    hook so the main UI picks the text up as the next user input.
    """
    transcription = ""
    r = sr.Recognizer()

    sample_rate, frames = audio
    # AudioData expects raw PCM bytes plus the width of ONE sample in bytes.
    # The original hard-coded sample_width=4 and passed the ndarray itself;
    # int16 microphone frames are 2 bytes wide, so derive the width from the
    # dtype and hand over the raw buffer explicitly.
    audio_data = sr.AudioData(
        sample_rate=sample_rate,
        frame_data=frames.tobytes(),
        sample_width=frames.dtype.itemsize,
    )

    try:
        transcription = r.recognize_whisper(audio_data, language="english", model="base.en")
    except sr.UnknownValueError:
        print("Whisper could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Whisper", e)

    # Signal the main UI to hijack the next input with the transcription.
    input_hijack.update({"state": True, "value": [transcription, transcription]})

    text_state += transcription + " "
    return text_state, text_state
28+
29+
30+
def update_hijack(val):
    """Mirror manual edits of the preview textbox into the hijack hook.

    Marks the hook active and stores ``val`` as both the visible and the
    internal text, then echoes ``val`` back for the bound gr.State output.
    """
    input_hijack['state'] = True
    input_hijack['value'] = [val, val]
    return val
33+
34+
35+
def auto_transcribe(audio, audio_auto, text_state=""):
    """Transcribe a fresh microphone recording when auto-transcribe is on.

    With no recording, or with the auto toggle off, the preview and state
    are simply cleared.
    """
    if audio is not None and audio_auto:
        return do_stt(audio, text_state)
    return "", ""
41+
42+
43+
def ui():
    """Build the extension's UI: preview textbox, auto toggle, and mic row.

    Component creation order is significant for gradio layout, so it is
    kept exactly as before: preview box, checkbox, then the row holding
    the microphone input and the manual Transcribe button.
    """
    tr_state = gr.State(value="")

    preview = gr.Textbox(
        label="STT-Input",
        placeholder='Speech Preview. Click "Generate" to send',
        interactive=True,
    )
    # Manual edits of the preview must also feed the hijack hook.
    preview.change(fn=update_hijack, inputs=[preview], outputs=[tr_state])

    auto_box = gr.Checkbox(label="Auto-Transcribe", value=True)

    with gr.Row():
        mic = gr.Audio(source="microphone")
        mic.change(
            fn=auto_transcribe,
            inputs=[mic, auto_box, tr_state],
            outputs=[preview, tr_state],
        )
        transcribe_btn = gr.Button(value="Transcribe")
        transcribe_btn.click(
            do_stt,
            inputs=[mic, tr_state],
            outputs=[preview, tr_state],
        )

0 commit comments

Comments
 (0)