AI · 7 min read

Attention Mechanisms

Focus on what matters in sequences.

Dr. Patricia Moore
December 18, 2025

AI that focuses.

What is Attention?

A mechanism that lets a model focus on the relevant parts of its input.

Like reading a long document and highlighting important parts!

Why Attention?

Problem with RNNs:

  • Long sequences → information gets lost along the way
  • By the end of a sequence, the model can't remember the beginning

Solution:

  • Look back at all previous inputs
  • Focus on relevant parts

Attention in Machine Translation

Translating: "The cat sat on the mat" → "Le chat..."

When generating "chat", attention focuses on "cat"!
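To make this concrete, here is a toy sketch of what the attention distribution over the source words might look like when the decoder generates "chat" (the weights are made up for illustration, not taken from a trained model):

# Hypothetical attention weights when generating "chat"
# (illustrative values only, not from a trained model)
source = ["The", "cat", "sat", "on", "the", "mat"]
weights = [0.05, 0.80, 0.05, 0.03, 0.04, 0.03]  # most of the mass on "cat"

for word, w in zip(source, weights):
    print(f"{word:>4}: {'#' * int(w * 40)} {w:.2f}")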

Basic Attention Mechanism

import numpy as np

def softmax(x, axis=-1):
    """Numerically stable softmax."""
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)

def attention(query, keys, values):
    """
    query: What we're looking for
    keys: What we're looking in
    values: What we return
    """
    # Calculate attention scores (dot-product similarity)
    scores = np.dot(query, keys.T)

    # Normalize scores into a probability distribution
    weights = softmax(scores)

    # Weighted sum of values
    output = np.dot(weights, values)

    return output, weights

# Example
query = np.array([1, 0, 0])  # Current word encoding
keys = np.array([
    [1, 0, 0],  # Previous word 1
    [0, 1, 0],  # Previous word 2
    [0, 0, 1]   # Previous word 3
])
values = keys  # For this toy example, the values are the same as the keys

output, weights = attention(query, keys, values)
print(f"Attention weights: {weights}")  # [0.7, 0.2, 0.1]
# Focuses most on word 1!

Scaled Dot-Product Attention

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Q: Query matrix
    K: Key matrix
    V: Value matrix
    mask: optional mask (1 = position is blocked)
    """
    d_k = Q.shape[-1]

    # Calculate attention scores, scaled by sqrt(d_k)
    scores = np.matmul(Q, np.swapaxes(K, -2, -1)) / np.sqrt(d_k)

    # Apply mask (e.g. so the decoder can't look ahead)
    if mask is not None:
        scores = scores + (mask * -1e9)

    # Softmax over the key dimension
    attention_weights = softmax(scores, axis=-1)

    # Weighted sum of values
    output = np.matmul(attention_weights, V)

    return output, attention_weights
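As a quick usage sketch (the shapes and variable names here are arbitrary), a causal "look-ahead" mask is just an upper-triangular matrix of ones, so each position can only attend to itself and earlier positions:

# Causal (look-ahead) mask: 1 above the diagonal = blocked positions
seq_len, d_k = 4, 8
Q = np.random.randn(seq_len, d_k)
K = np.random.randn(seq_len, d_k)
V = np.random.randn(seq_len, d_k)

causal_mask = np.triu(np.ones((seq_len, seq_len)), k=1)

out, w = scaled_dot_product_attention(Q, K, V, mask=causal_mask)
print(w.round(2))  # each row sums to 1; the upper triangle is ~0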

Attention in Seq2Seq

import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Embedding, GRU

class BahdanauAttention(Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)
    
    def call(self, query, values):
        # query: decoder hidden state [batch, hidden]
        # values: encoder outputs [batch, seq_len, hidden]
        
        # Expand query to match values shape
        query_with_time = tf.expand_dims(query, 1)
        
        # Score calculation
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time) + self.W2(values)
        ))
        
        # Attention weights
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # Context vector
        context = attention_weights * values
        context = tf.reduce_sum(context, axis=1)
        
        return context, attention_weights

# Use in decoder
class Decoder(Layer):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(hidden_units, return_sequences=True, return_state=True)
        self.fc = Dense(vocab_size)
        self.attention = BahdanauAttention(hidden_units)
    
    def call(self, x, hidden, encoder_outputs):
        # Apply attention
        context, attention_weights = self.attention(hidden, encoder_outputs)
        
        # Embed input
        x = self.embedding(x)
        
        # Concatenate context and embedded input
        x = tf.concat([tf.expand_dims(context, 1), x], axis=-1)
        
        # Pass through GRU
        output, state = self.gru(x)
        
        # Final prediction
        x = self.fc(output)
        
        return x, state, attention_weights
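A quick smoke test with dummy tensors (the sizes below are arbitrary, chosen only to check shapes):

# Smoke test with dummy tensors (sizes chosen for illustration)
batch, seq_len, hidden_units = 2, 7, 16
vocab_size, embedding_dim = 100, 8

decoder = Decoder(vocab_size, embedding_dim, hidden_units)

decoder_input = tf.zeros((batch, 1), dtype=tf.int32)          # one target token per step
decoder_hidden = tf.zeros((batch, hidden_units))              # previous decoder state
encoder_outputs = tf.random.normal((batch, seq_len, hidden_units))

logits, state, attn = decoder(decoder_input, decoder_hidden, encoder_outputs)
print(logits.shape, attn.shape)  # (2, 1, 100) and (2, 7, 1)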

Self-Attention

Each element attends to all elements (including itself):

def self_attention(X, Wq, Wk, Wv):
    """
    X: Input sequence [seq_len, d_model]
    Wq, Wk, Wv: learned projection matrices [d_model, d_k]
    """
    # Create Q, K, V from the same input
    Q = X @ Wq
    K = X @ Wk
    V = X @ Wv

    # Scaled dot-product attention
    output, weights = scaled_dot_product_attention(Q, K, V)

    return output, weights

# Example: "The cat sat"
# "cat" can attend to "The", "cat", "sat"
# Learns relationships between all words
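A minimal sketch of calling it, with random matrices standing in for the projections that would normally be learned:

# Random stand-ins for the learned projection matrices
d_model = 4
X = np.random.randn(3, d_model)         # 3 tokens: "The", "cat", "sat"
Wq = np.random.randn(d_model, d_model)
Wk = np.random.randn(d_model, d_model)
Wv = np.random.randn(d_model, d_model)

out, weights = self_attention(X, Wq, Wk, Wv)
print(weights.round(2))  # 3x3: how much each word attends to every word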

Multi-Head Attention

Multiple attention mechanisms in parallel:

class MultiHeadAttention(Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        assert d_model % num_heads == 0
        
        self.depth = d_model // num_heads
        
        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)
    
    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, query, key, value):
        batch_size = tf.shape(query)[0]
        
        # Linear projections
        query = self.wq(query)
        key = self.wk(key)
        value = self.wv(value)
        
        # Split into multiple heads
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)
        
        # Scaled dot-product attention for each head
        # (assumes a TensorFlow version of scaled_dot_product_attention,
        #  e.g. the sketch after this class)
        scaled_attention, _ = scaled_dot_product_attention(query, key, value)

        # Reorder to [batch, seq_len, num_heads, depth] before merging heads
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # Concatenate heads
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        
        # Final linear projection
        output = self.dense(concat_attention)
        
        return output
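The class above reuses scaled_dot_product_attention. Inside a Keras layer it is safer to use TensorFlow ops, so here is a TF re-implementation of the same interface (a sketch mirroring the NumPy version), followed by a quick shape check with dummy tensors:

def scaled_dot_product_attention(Q, K, V, mask=None):
    """TensorFlow version of the NumPy sketch above (same interface)."""
    d_k = tf.cast(tf.shape(K)[-1], tf.float32)
    scores = tf.matmul(Q, K, transpose_b=True) / tf.sqrt(d_k)
    if mask is not None:
        scores += mask * -1e9
    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, V), weights

# Shape check with dummy tensors (sizes chosen arbitrarily)
mha = MultiHeadAttention(d_model=64, num_heads=8)
x = tf.random.normal((2, 10, 64))   # [batch, seq_len, d_model]
out = mha(x, x, x)                  # self-attention: query = key = value
print(out.shape)                    # (2, 10, 64)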

Visualizing Attention

import matplotlib.pyplot as plt
import seaborn as sns

def plot_attention(attention, sentence, translation):
    fig, ax = plt.subplots(figsize=(10, 10))
    
    sns.heatmap(attention, xticklabels=sentence, yticklabels=translation, ax=ax)
    
    ax.set_xlabel('Input')
    ax.set_ylabel('Output')
    plt.show()

# Shows which input words the model focused on for each output word
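For example, with a made-up attention matrix for the toy translation above (purely illustrative, not from a trained model):

# Made-up attention matrix: rows are output words, columns are input words
sentence = ["The", "cat", "sat", "on", "the", "mat"]
translation = ["Le", "chat", "s'est", "assis", "sur", "le", "tapis"]
attention_matrix = np.random.dirichlet(np.ones(len(sentence)), size=len(translation))

plot_attention(attention_matrix, sentence, translation)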

Applications

  • Machine translation
  • Text summarization
  • Image captioning
  • Question answering
  • Speech recognition

Remember

  • Attention allows focusing on relevant parts
  • Self-attention enables Transformers
  • Multi-head attention learns different relationships
  • Visualization helps understand what model learned
#AI#Advanced#Attention