AI · 7 min read
Attention Mechanisms
Focus on what matters in sequences.
Dr. Patricia Moore
December 18, 2025
AI that focuses.
What is Attention?
Attention is a mechanism that lets a model focus on the most relevant parts of its input when producing each output.
It's like reading a long document and highlighting the important parts!
Why Attention?
The problem with RNNs:
- Long sequences → information gets lost
- By the end of the sequence, the model has forgotten the beginning
The solution with attention:
- Look back at all of the previous inputs at every step
- Focus on the parts that are most relevant right now
Attention in Machine Translation
Translating: "The cat sat on the mat" → "Le chat..."
When generating "chat", attention focuses on "cat"!
Basic Attention Mechanism
import numpy as np

def softmax(x, axis=-1):
    # Numerically stable softmax
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)

def attention(query, keys, values):
    """
    query:  what we're looking for
    keys:   what we're looking in
    values: what we return
    """
    # Calculate attention scores (similarity between query and each key)
    scores = np.dot(query, keys.T)
    # Normalize scores to probabilities
    weights = softmax(scores)
    # Weighted sum of values
    output = np.dot(weights, values)
    return output, weights

# Example
query = np.array([1.0, 0.0, 0.0])   # Current word encoding
keys = np.array([
    [1.0, 0.0, 0.0],   # Previous word 1
    [0.0, 1.0, 0.0],   # Previous word 2
    [0.0, 0.0, 1.0],   # Previous word 3
])
values = keys  # Often the same tensor as the keys

output, weights = attention(query, keys, values)
print(f"Attention weights: {weights}")  # ~[0.58, 0.21, 0.21]
# The highest weight goes to word 1, which matches the query!
Scaled Dot-Product Attention
def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Q: query matrix [..., seq_len_q, d_k]
    K: key matrix   [..., seq_len_k, d_k]
    V: value matrix [..., seq_len_k, d_v]
    """
    d_k = Q.shape[-1]
    # Calculate attention scores, scaled by sqrt(d_k) to keep them in a stable range
    scores = np.matmul(Q, np.swapaxes(K, -2, -1)) / np.sqrt(d_k)
    # Apply mask (e.g. so the decoder can't look ahead)
    if mask is not None:
        scores = scores + (mask * -1e9)
    # Softmax over the key dimension
    attention_weights = softmax(scores, axis=-1)
    # Weighted sum of the values
    output = np.matmul(attention_weights, V)
    return output, attention_weights
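This is the standard softmax(QKᵀ / √d_k)·V computation. As a quick sanity check, here is a minimal sketch that runs it on random matrices with a look-ahead mask; the shapes and the mask construction are my own illustration, not from the original post:
# Toy shapes: 4 tokens, model dimension 8
rng = np.random.default_rng(0)
Q = rng.normal(size=(4, 8))
K = rng.normal(size=(4, 8))
V = rng.normal(size=(4, 8))

# Causal ("look-ahead") mask: 1 above the diagonal marks positions to hide
causal_mask = np.triu(np.ones((4, 4)), k=1)

output, weights = scaled_dot_product_attention(Q, K, V, mask=causal_mask)
print(output.shape)           # (4, 8)
print(weights.sum(axis=-1))   # each row sums to ~1.0
print(np.triu(weights, k=1))  # ~0 everywhere: no attention to future tokens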
Attention in Seq2Seq
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Embedding, GRU

class BahdanauAttention(Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        # query:  decoder hidden state [batch, hidden]
        # values: encoder outputs      [batch, seq_len, hidden]
        # Expand query so it broadcasts along the time dimension of values
        query_with_time = tf.expand_dims(query, 1)
        # Additive (Bahdanau) score: [batch, seq_len, 1]
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time) + self.W2(values)
        ))
        # Attention weights over the encoder time steps
        attention_weights = tf.nn.softmax(score, axis=1)
        # Context vector: weighted sum of encoder outputs
        context = attention_weights * values
        context = tf.reduce_sum(context, axis=1)
        return context, attention_weights

# Use in decoder
class Decoder(Layer):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(hidden_units, return_sequences=True, return_state=True)
        self.fc = Dense(vocab_size)
        self.attention = BahdanauAttention(hidden_units)

    def call(self, x, hidden, encoder_outputs):
        # Apply attention using the previous decoder hidden state
        context, attention_weights = self.attention(hidden, encoder_outputs)
        # Embed the input token: [batch, 1, embedding_dim]
        x = self.embedding(x)
        # Concatenate the context vector and the embedded input
        x = tf.concat([tf.expand_dims(context, 1), x], axis=-1)
        # Pass through the GRU
        output, state = self.gru(x)
        # Project to vocabulary logits
        x = self.fc(output)
        return x, state, attention_weights
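A quick smoke test of the decoder; the vocabulary size, dimensions, and dummy tensors below are illustrative assumptions, not values from the post:
# Hypothetical sizes, just for shape-checking
vocab_size, embedding_dim, hidden_units = 1000, 64, 128
batch, src_len = 2, 10

decoder = Decoder(vocab_size, embedding_dim, hidden_units)
dummy_token = tf.zeros((batch, 1), dtype=tf.int32)                 # one target token per example
dummy_hidden = tf.zeros((batch, hidden_units))                     # previous decoder state
dummy_encoder_outputs = tf.random.normal((batch, src_len, hidden_units))

logits, state, attn = decoder(dummy_token, dummy_hidden, dummy_encoder_outputs)
print(logits.shape)  # (2, 1, 1000) - a distribution over the vocabulary
print(attn.shape)    # (2, 10, 1)   - one weight per encoder time step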
Self-Attention
Each element attends to all elements (including itself):
def self_attention(X, Wq, Wk, Wv):
    """
    X: input sequence [seq_len, d_model]
    Wq, Wk, Wv: learned projection matrices [d_model, d_model]
    """
    # Create Q, K, V from the same input
    Q = X @ Wq
    K = X @ Wk
    V = X @ Wv
    # Scaled dot-product attention over the sequence itself
    output, weights = scaled_dot_product_attention(Q, K, V)
    return output, weights
# Example: "The cat sat"
# "cat" can attend to "The", "cat", "sat"
# Learns relationships between all words
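A quick sketch with random (untrained) projection matrices, just to show the shapes; the sizes are my own example values:
# Toy example: 3 tokens ("The", "cat", "sat"), d_model = 4
rng = np.random.default_rng(42)
X = rng.normal(size=(3, 4))    # token representations
Wq = rng.normal(size=(4, 4))   # in practice these matrices are learned
Wk = rng.normal(size=(4, 4))
Wv = rng.normal(size=(4, 4))

output, weights = self_attention(X, Wq, Wk, Wv)
print(weights.shape)  # (3, 3): each word attends to every word, including itself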
Multi-Head Attention
Multiple attention mechanisms in parallel:
class MultiHeadAttention(Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % num_heads == 0
        self.depth = d_model // num_heads
        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)

    def split_heads(self, x, batch_size):
        # [batch, seq_len, d_model] -> [batch, num_heads, seq_len, depth]
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, query, key, value):
        batch_size = tf.shape(query)[0]
        # Linear projections
        query = self.wq(query)
        key = self.wk(key)
        value = self.wv(value)
        # Split into multiple heads
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)
        # Scaled dot-product attention for each head
        # (assumes a TensorFlow version of scaled_dot_product_attention,
        #  i.e. tf.matmul / tf.nn.softmax instead of the NumPy calls above)
        scaled_attention, _ = scaled_dot_product_attention(query, key, value)
        # Put the sequence dimension back before merging the heads
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        # Concatenate heads: [batch, seq_len, d_model]
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        # Final linear projection
        output = self.dense(concat_attention)
        return output
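To actually run the layer you need a TensorFlow implementation of the attention step. Here is a minimal sketch of one (my own, mirroring the NumPy function above), followed by a shape check with example dimensions of my choosing:
def scaled_dot_product_attention(Q, K, V, mask=None):
    # TensorFlow twin of the NumPy function above
    d_k = tf.cast(tf.shape(K)[-1], tf.float32)
    scores = tf.matmul(Q, K, transpose_b=True) / tf.sqrt(d_k)
    if mask is not None:
        scores += mask * -1e9
    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, V), weights

# Shape check: batch of 2 sequences, 5 tokens each, d_model = 16, 4 heads
mha = MultiHeadAttention(d_model=16, num_heads=4)
x = tf.random.normal((2, 5, 16))
out = mha(x, x, x)    # self-attention: query, key, and value are the same tensor
print(out.shape)      # (2, 5, 16)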
Visualizing Attention
import matplotlib.pyplot as plt
import seaborn as sns

def plot_attention(attention, sentence, translation):
    # attention: [output_len, input_len] matrix of weights
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(attention, xticklabels=sentence, yticklabels=translation, ax=ax)
    ax.set_xlabel('Input')
    ax.set_ylabel('Output')
    plt.show()

# Shows which input words the model focused on for each output word
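For example, with a made-up weight matrix for the translation from earlier (the numbers are purely illustrative, not model output):
# Illustrative only: hypothetical weights for "The cat sat on the mat" -> "Le chat..."
sentence = ["The", "cat", "sat", "on", "the", "mat"]
translation = ["Le", "chat"]
dummy_attention = np.array([
    [0.70, 0.10, 0.05, 0.05, 0.05, 0.05],   # weights when generating "Le"
    [0.10, 0.75, 0.05, 0.03, 0.03, 0.04],   # weights when generating "chat"
])
plot_attention(dummy_attention, sentence, translation)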
Applications
- Machine translation
- Text summarization
- Image captioning
- Question answering
- Speech recognition
Remember
- Attention allows focusing on relevant parts
- Self-attention enables Transformers
- Multi-head attention learns different relationships
- Visualization helps understand what model learned
#AI #Advanced #Attention