AI · 6 min read

Speech Recognition with AI

Convert speech to text with AI.

Dr. Kevin Brown
December 18, 2025

Speech recognition turns spoken words into text that programs can search, store, and act on. In this post, we'll transcribe audio with the SpeechRecognition library and OpenAI's Whisper, build simple voice commands, and identify speakers with pyannote.audio.

What is Speech Recognition?

Speech recognition (also called speech-to-text or ASR) converts spoken audio into written text.

Use cases: voice assistants, transcription, and accessibility tools.

Using the SpeechRecognition Library

# Install
# pip install SpeechRecognition

import speech_recognition as sr

# Create recognizer
recognizer = sr.Recognizer()

# From microphone
with sr.Microphone() as source:
    print("Speak something...")
    audio = recognizer.listen(source)
    
    try:
        text = recognizer.recognize_google(audio)
        print(f"You said: {text}")
    except sr.UnknownValueError:
        print("Could not understand audio")
    except sr.RequestError as e:
        print(f"Error: {e}")
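If recognition struggles in a noisy room, the recognizer can first calibrate its energy threshold against the background noise; a small sketch:

# Calibrate against ambient noise, then listen
with sr.Microphone() as source:
    recognizer.adjust_for_ambient_noise(source, duration=1)
    print("Speak something...")
    audio = recognizer.listen(source)
    print(recognizer.recognize_google(audio))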

From an Audio File

# Load audio file
with sr.AudioFile('audio.wav') as source:
    audio = recognizer.record(source)
    text = recognizer.recognize_google(audio)
    print(text)
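AudioFile reads WAV, AIFF, and FLAC files; for other formats such as MP3, one option is to convert first. A quick sketch using pydub (which requires ffmpeg; filenames are placeholders):

# pip install pydub  (needs ffmpeg)
from pydub import AudioSegment

# Convert MP3 to WAV so AudioFile can read it
AudioSegment.from_file("audio.mp3").export("audio.wav", format="wav")

with sr.AudioFile('audio.wav') as source:
    audio = recognizer.record(source)
    print(recognizer.recognize_google(audio))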

Whisper (OpenAI)

OpenAI's Whisper is an open-source model that delivers state-of-the-art accuracy and runs entirely on your own machine:

# Install
# pip install openai-whisper

import whisper

# Load model
model = whisper.load_model("base")  # tiny, base, small, medium, large

# Transcribe
result = model.transcribe("audio.mp3")
print(result["text"])

# With timestamps
segments = result["segments"]
for segment in segments:
    start = segment["start"]
    end = segment["end"]
    text = segment["text"]
    print(f"[{start:.2f}s - {end:.2f}s]: {text}")

Real-time Transcription

Whisper transcribes complete audio files, so a simple way to approximate real-time use is to record short clips with PyAudio and transcribe each one as it finishes:

# Install
# pip install pyaudio

import pyaudio
import wave

# Record audio
def record_audio(filename, duration=5):
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    
    p = pyaudio.PyAudio()
    
    stream = p.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK
    )
    
    print("Recording...")
    frames = []
    
    for _ in range(0, int(RATE / CHUNK * duration)):
        data = stream.read(CHUNK)
        frames.append(data)
    
    print("Done recording")
    
    stream.stop_stream()
    stream.close()
    p.terminate()
    
    # Save
    wf = wave.open(filename, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()

# Use it
record_audio("recording.wav", duration=5)
result = model.transcribe("recording.wav")
print(result["text"])

Different Languages

# Specify language
result = model.transcribe("audio.mp3", language="es")  # Spanish
print(result["text"])

# Auto-detect language
result = model.transcribe("audio.mp3")
print(f"Detected language: {result['language']}")
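Whisper can also translate speech in other languages directly into English text by setting the task option; a quick sketch:

# Translate non-English speech into English text
result = model.transcribe("audio.mp3", task="translate")
print(result["text"])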

Voice Commands

def listen_for_command():
    with sr.Microphone() as source:
        print("Listening...")
        audio = recognizer.listen(source)
        
        try:
            command = recognizer.recognize_google(audio).lower()
            return command
        except (sr.UnknownValueError, sr.RequestError):
            return None

# Command handler
while True:
    command = listen_for_command()
    
    if command:
        if "hello" in command:
            print("Hello there!")
        elif "time" in command:
            from datetime import datetime
            now = datetime.now()
            print(f"Current time: {now.strftime('%H:%M')}")
        elif "stop" in command:
            print("Goodbye!")
            break
        else:
            print(f"You said: {command}")

Speaker Diarization

Diarization identifies who is speaking and when:

# Install
# pip install pyannote.audio

from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")  # gated model: may require a Hugging Face access token

# Diarize
diarization = pipeline("audio.wav")

# Print results
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"Speaker {speaker}: {turn.start:.1f}s - {turn.end:.1f}s")
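To put words to each speaker, a rough follow-up is to crop each turn out of the recording and feed it to the Whisper model loaded earlier; a sketch using pydub for the slicing (filenames are placeholders):

from pydub import AudioSegment

audio = AudioSegment.from_wav("audio.wav")

for turn, _, speaker in diarization.itertracks(yield_label=True):
    # pydub slices in milliseconds; turn.start and turn.end are in seconds
    clip = audio[int(turn.start * 1000):int(turn.end * 1000)]
    clip.export("turn.wav", format="wav")
    text = model.transcribe("turn.wav")["text"]
    print(f"{speaker}: {text.strip()}")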

Best Practices

  • Good audio: Minimize background noise
  • Sample rate: Use 16 kHz mono for most models
  • Segment long audio: Break long recordings into smaller chunks (see the sketch below)
  • Post-processing: Clean up the raw text output
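As a rough illustration of the sample-rate and chunking advice, here is a minimal sketch using pydub (assuming ffmpeg is installed; the 30-second chunk length and filenames are placeholders, and model is the Whisper model loaded earlier):

from pydub import AudioSegment

# Resample to 16 kHz mono, which most speech models expect
audio = AudioSegment.from_file("long_audio.mp3")
audio = audio.set_frame_rate(16000).set_channels(1)

# Split into 30-second chunks and transcribe each one
chunk_ms = 30 * 1000
for i, start in enumerate(range(0, len(audio), chunk_ms)):
    chunk = audio[start:start + chunk_ms]
    chunk.export(f"chunk_{i}.wav", format="wav")
    print(model.transcribe(f"chunk_{i}.wav")["text"])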

Remember

  • Whisper offers state-of-the-art accuracy
  • Works with multiple languages
  • Needs clear audio for best results
  • Can run locally (no API needed)
#AI #Advanced #Speech