
Reinforcement Learning Basics

Train AI through rewards and penalties.

Dr. Patricia Moore
December 18, 2025

AI that learns by doing.

What is Reinforcement Learning?

In reinforcement learning, an agent learns by interacting with an environment: it tries actions, observes what happens, and adjusts its behavior to collect more reward over time.

Key Idea: Actions → Rewards/Penalties → Learning

Like training a dog with treats!

Key Concepts

Agent: The learner and decision maker (the AI)
Environment: The world the agent interacts with
State: The agent's current situation
Action: What the agent can do
Reward: Feedback from the environment after an action
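
These pieces fit together in a single loop: the agent observes the state, picks an action, and the environment answers with a reward and the next state. Here is a minimal sketch of that loop, assuming a Gym-style environment (Gym is introduced below) and a hypothetical choose_action placeholder:

import gym

env = gym.make('CartPole-v1')  # any Gym environment works here

def choose_action(state):
    # Placeholder policy: act randomly; a real agent replaces this
    return env.action_space.sample()

state = env.reset()
done = False
total_reward = 0

while not done:
    action = choose_action(state)                       # agent acts
    next_state, reward, done, info = env.step(action)   # environment responds
    total_reward += reward                               # feedback accumulates
    state = next_state                                   # move to the next state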

Example - Robot Navigation

State: Robot's position in room
Actions: Move forward, turn left, turn right
Reward: +10 for reaching goal, -1 for hitting wall
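
As a concrete sketch, the reward signal for this example could be a small function. The grid size, goal cell, and wall check below are hypothetical, just to make the numbers runnable:

# Hypothetical reward function for the robot navigation example
GOAL = (4, 4)  # assumed goal cell in a 5x5 grid

def reward_for(position, hit_wall):
    if position == GOAL:
        return 10   # +10 for reaching the goal
    if hit_wall:
        return -1   # -1 for hitting a wall
    return 0        # no reward otherwise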

Q-Learning

A simple value-based algorithm: keep a table Q[state, action] of expected returns and, after every step, nudge each entry toward reward + discount * max Q[next_state]:

import numpy as np
import gym

# Assumes a small, discrete Gym environment (e.g. FrozenLake-v1)
# so that states and actions can index a table
env = gym.make('FrozenLake-v1')
num_states = env.observation_space.n
num_actions = env.action_space.n

# Q-table: state x action → expected return
Q = np.zeros((num_states, num_actions))

# Hyperparameters
learning_rate = 0.1
discount = 0.95
epsilon = 0.1  # Exploration rate

for episode in range(1000):
    state = env.reset()
    done = False
    
    while not done:
        # Choose action (epsilon-greedy)
        if np.random.random() < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(Q[state])  # Exploit
        
        # Take action
        next_state, reward, done, _ = env.step(action)
        
        # Update Q-value
        old_q = Q[state, action]
        next_max = np.max(Q[next_state])
        new_q = old_q + learning_rate * (reward + discount * next_max - old_q)
        Q[state, action] = new_q
        
        state = next_state

# Use the learned policy (always pick the best-known action)
state = env.reset()
done = False
while not done:
    action = np.argmax(Q[state])
    state, reward, done, _ = env.step(action)

OpenAI Gym

A standard toolkit of ready-made RL environments:

import gym

# Create environment
env = gym.make('CartPole-v1')

# Reset environment
state = env.reset()

for _ in range(1000):
    env.render()
    
    # Take random action
    action = env.action_space.sample()
    
    # Step the environment and observe the result
    state, reward, done, info = env.step(action)
    
    if done:
        break

env.close()

Deep Q-Network (DQN)

Q-Learning with a neural network instead of a table, so it scales to large state spaces:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from collections import deque
import numpy as np
import random

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # Discount rate
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        
        self.model = self._build_model()
    
    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
        return model
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state):
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])
    
    def replay(self, batch_size):
        # Train on a random minibatch of stored transitions (experience replay)
        minibatch = random.sample(self.memory, batch_size)
        
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target += self.gamma * np.amax(self.model.predict(next_state)[0])
            
            target_f = self.model.predict(state)
            target_f[0][action] = target
            
            self.model.fit(state, target_f, epochs=1, verbose=0)
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Train the DQN on CartPole (uses the env from the Gym section above:
# a 4-dimensional state and 2 discrete actions)
agent = DQNAgent(state_size=4, action_size=2)

for episode in range(1000):
    state = env.reset()
    state = np.reshape(state, [1, 4])
    
    for time in range(500):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, 4])
        
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        
        if done:
            print(f"Episode: {episode}, Score: {time}")
            break
    
    if len(agent.memory) > 32:
        agent.replay(32)

Policy Gradient

Instead of learning Q-values and deriving a policy from them, learn the policy directly: the network outputs a probability for each action.

# Instead of Q-values, output action probabilities
model = Sequential([
    Dense(24, input_dim=state_size, activation='relu'),
    Dense(24, activation='relu'),
    Dense(action_size, activation='softmax')  # Probabilities
])

# Sample action from probability distribution
probs = model.predict(state)[0]
action = np.random.choice(action_size, p=probs)
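
The snippet above only covers action selection. To actually learn, the standard policy-gradient rule (REINFORCE, not shown in the snippet above) increases the log-probability of actions that were followed by high returns. A rough sketch, where the helper functions and names are assumptions rather than part of the article's code:

import numpy as np
import tensorflow as tf

def discounted_returns(rewards, gamma=0.95):
    # Return G_t for every step of one finished episode
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def reinforce_update(model, optimizer, states, actions, rewards):
    # states: list of state vectors, actions: list of ints, rewards: list of floats
    returns = discounted_returns(rewards)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # variance reduction

    states = np.array(states, dtype=np.float32)
    actions = tf.constant(actions, dtype=tf.int32)

    with tf.GradientTape() as tape:
        probs = model(states)                                    # (T, action_size)
        idx = tf.stack([tf.range(tf.shape(actions)[0]), actions], axis=1)
        taken = tf.gather_nd(probs, idx)                         # prob of each taken action
        loss = -tf.reduce_sum(tf.math.log(taken + 1e-8) * returns)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

Call it once per finished episode with the states visited, actions taken, and rewards received.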

Applications

  • Game playing (AlphaGo, Atari)
  • Robotics
  • Self-driving cars
  • Resource management
  • Trading algorithms
  • Recommendation systems

Challenges

  • Sparse rewards (useful feedback arrives rarely)
  • Exploration vs. exploitation trade-off
  • Sample inefficiency (many interactions needed)
  • Training stability

Remember

  • RL learns through trial and error
  • Tabular Q-Learning works for small, discrete problems
  • DQN scales to larger, more complex environments
  • Training usually takes many episodes
#AI #Advanced #RL