Speech Recognition with AI
Convert speech to text with AI.
Dr. Kevin Brown
December 18, 2025
Turn spoken words into text.
What is Speech Recognition?
Speech recognition converts spoken audio into written text.
Use cases: voice assistants, transcription, and accessibility tools.
Using Speech Recognition
# Install
# pip install SpeechRecognition
import speech_recognition as sr

# Create recognizer
recognizer = sr.Recognizer()

# From microphone (sr.Microphone requires PyAudio)
with sr.Microphone() as source:
    print("Speak something...")
    audio = recognizer.listen(source)

try:
    # recognize_google uses Google's free web API (needs internet access)
    text = recognizer.recognize_google(audio)
    print(f"You said: {text}")
except sr.UnknownValueError:
    print("Could not understand audio")
except sr.RequestError as e:
    print(f"Error: {e}")
From Audio File
# Load audio file (WAV, AIFF, and FLAC are supported)
with sr.AudioFile('audio.wav') as source:
    audio = recognizer.record(source)

text = recognizer.recognize_google(audio)
print(text)
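Since AudioFile only reads WAV, AIFF, and FLAC, other formats need converting first. One option is pydub; a sketch assuming ffmpeg is installed and 'podcast.mp3' is a placeholder file:

# pip install pydub (requires ffmpeg)
from pydub import AudioSegment

# Convert MP3 to WAV so sr.AudioFile can read it
AudioSegment.from_mp3("podcast.mp3").export("podcast.wav", format="wav")

with sr.AudioFile("podcast.wav") as source:
    audio = recognizer.record(source)
print(recognizer.recognize_google(audio))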
Whisper (OpenAI)
State-of-the-art speech recognition:
# Install
# pip install openai-whisper
import whisper
# Load model
model = whisper.load_model("base") # tiny, base, small, medium, large
# Transcribe
result = model.transcribe("audio.mp3")
print(result["text"])
# With timestamps
segments = result["segments"]
for segment in segments:
    start = segment["start"]
    end = segment["end"]
    text = segment["text"]
    print(f"[{start:.2f}s - {end:.2f}s]: {text}")
Real-time Transcription
Record short clips from the microphone with PyAudio, then transcribe them with Whisper:
# Install
# pip install pyaudio
import pyaudio
import wave

# Record audio from the default microphone and save it as a WAV file
def record_audio(filename, duration=5):
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000

    p = pyaudio.PyAudio()
    stream = p.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK
    )

    print("Recording...")
    frames = []
    for _ in range(0, int(RATE / CHUNK * duration)):
        data = stream.read(CHUNK)
        frames.append(data)
    print("Done recording")

    stream.stop_stream()
    stream.close()
    p.terminate()

    # Save as WAV
    wf = wave.open(filename, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()

# Use it
record_audio("recording.wav", duration=5)
result = model.transcribe("recording.wav")
print(result["text"])
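For rolling output, you can repeat this record-then-transcribe cycle in a loop. A simple sketch (true streaming transcription needs a more involved audio pipeline):

# Transcribe the microphone in 5-second chunks until Ctrl+C
try:
    while True:
        record_audio("chunk.wav", duration=5)
        text = model.transcribe("chunk.wav")["text"].strip()
        if text:
            print(text)
except KeyboardInterrupt:
    print("Stopped")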
Different Languages
# Specify language
result = model.transcribe("audio.mp3", language="es") # Spanish
print(result["text"])
# Auto-detect language
result = model.transcribe("audio.mp3")
print(f"Detected language: {result['language']}")
Voice Commands
def listen_for_command():
    with sr.Microphone() as source:
        print("Listening...")
        audio = recognizer.listen(source)
    try:
        command = recognizer.recognize_google(audio).lower()
        return command
    except (sr.UnknownValueError, sr.RequestError):
        return None

# Command handler
while True:
    command = listen_for_command()
    if command:
        if "hello" in command:
            print("Hello there!")
        elif "time" in command:
            from datetime import datetime
            now = datetime.now()
            print(f"Current time: {now.strftime('%H:%M')}")
        elif "stop" in command:
            print("Goodbye!")
            break
        else:
            print(f"You said: {command}")
Speaker Diarization
Identify who is speaking:
# Install
# pip install pyannote.audio
from pyannote.audio import Pipeline

# Gated model: may require a Hugging Face access token
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")

# Diarize
diarization = pipeline("audio.wav")

# Print results
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"Speaker {speaker}: {turn.start:.1f}s - {turn.end:.1f}s")
Best Practices
- Good audio: minimize background noise
- Sample rate: use 16 kHz for most models
- Segment long audio: break long recordings into smaller chunks (see the sketch below)
- Post-processing: clean up the text output
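One way to segment a long recording into fixed-size chunks before transcription (pydub again; the 60-second window is an arbitrary choice):

from pydub import AudioSegment

CHUNK_MS = 60 * 1000  # 60-second windows

audio_seg = AudioSegment.from_wav("long_audio.wav")
for i in range(0, len(audio_seg), CHUNK_MS):  # len() is in milliseconds
    chunk_path = f"chunk_{i // CHUNK_MS:03d}.wav"
    audio_seg[i:i + CHUNK_MS].export(chunk_path, format="wav")
    print(model.transcribe(chunk_path)["text"])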
Remember
- Whisper is among the most accurate open models available
- Works with multiple languages
- Needs clear audio for best results
- Can run locally (no API needed)
#AI #Advanced #Speech