Attention Mechanisms
Focus on what matters in sequences.
AI that focuses.
What is Attention?
A mechanism that lets a model focus on the relevant parts of its input.
Like reading a long document and highlighting important parts!
Why Attention?
**Problem with RNNs**:
- Long sequences → information gets lost
- By the end of the sequence, the model can barely remember the beginning

**Solution**:
- Look back at all previous inputs at every step
- Focus on the parts that are relevant right now
Attention in Machine Translation
Translating: "The cat sat on the mat" → "Le chat..."
When generating "chat", attention focuses on "cat"!
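To make this concrete, here is a toy sketch of the kind of weight distribution a decoder might produce at that step. The tokens and numbers below are invented for illustration, not output from a trained model:

```python
# Hypothetical attention weights while generating "chat" (illustrative only)
source_tokens = ["The", "cat", "sat", "on", "the", "mat"]
toy_weights   = [0.05, 0.75, 0.08, 0.04, 0.04, 0.04]

for token, weight in zip(source_tokens, toy_weights):
    print(f"{token:>4}: {'#' * int(weight * 40)} {weight:.2f}")
```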
Basic Attention Mechanism
```python
import numpy as np

def softmax(x, axis=-1):
    """Numerically stable softmax."""
    x = x - np.max(x, axis=axis, keepdims=True)
    e = np.exp(x)
    return e / np.sum(e, axis=axis, keepdims=True)

def attention(query, keys, values):
    """
    query:  what we're looking for
    keys:   what we're looking in
    values: what we return
    """
    # Calculate attention scores (dot-product similarity)
    scores = np.dot(query, keys.T)

    # Normalize the scores into probabilities
    weights = softmax(scores)

    # Weighted sum of the values
    output = np.dot(weights, values)
    return output, weights

# Example
query = np.array([1, 0, 0])   # Current word encoding
keys = np.array([
    [1, 0, 0],                # Previous word 1
    [0, 1, 0],                # Previous word 2
    [0, 0, 1]                 # Previous word 3
])
values = keys                 # Often the same as the keys

output, weights = attention(query, keys, values)
print(f"Attention weights: {weights}")
# ≈ [0.58, 0.21, 0.21] → focuses most on word 1!
```
Scaled Dot-Product Attention
```python
def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Q: query matrix [..., seq_len_q, d_k]
    K: key matrix   [..., seq_len_k, d_k]
    V: value matrix [..., seq_len_k, d_v]
    """
    d_k = Q.shape[-1]

    # Calculate attention scores, scaled by sqrt(d_k)
    scores = np.matmul(Q, np.swapaxes(K, -2, -1)) / np.sqrt(d_k)

    # Apply mask (e.g. so the decoder can't look ahead)
    if mask is not None:
        scores = scores + (mask * -1e9)

    # Softmax over the key dimension
    attention_weights = softmax(scores, axis=-1)

    # Weighted sum of the values
    output = np.matmul(attention_weights, V)
    return output, attention_weights
```
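A quick usage sketch of the function above, with made-up shapes and a causal mask (the mask marks positions a decoder should not attend to):

```python
# Toy usage: one "sentence" of 4 tokens with 8-dimensional representations
seq_len, d_model = 4, 8
rng = np.random.default_rng(0)
Q = rng.normal(size=(seq_len, d_model))
K = rng.normal(size=(seq_len, d_model))
V = rng.normal(size=(seq_len, d_model))

# Causal mask: 1 above the diagonal = positions a decoder may not attend to
causal_mask = np.triu(np.ones((seq_len, seq_len)), k=1)

output, weights = scaled_dot_product_attention(Q, K, V, mask=causal_mask)
print(output.shape)   # (4, 8)
print(weights[0])     # First token can only attend to itself → ≈ [1, 0, 0, 0]
```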
Attention in Seq2Seq
```python
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Embedding, GRU

class BahdanauAttention(Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        # query:  decoder hidden state  [batch, hidden]
        # values: encoder outputs       [batch, seq_len, hidden]

        # Expand query to match the values shape
        query_with_time = tf.expand_dims(query, 1)

        # Additive (Bahdanau) score calculation
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time) + self.W2(values)
        ))

        # Attention weights over the encoder time steps
        attention_weights = tf.nn.softmax(score, axis=1)

        # Context vector: weighted sum of encoder outputs
        context = attention_weights * values
        context = tf.reduce_sum(context, axis=1)
        return context, attention_weights

# Use in a decoder
class Decoder(Layer):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(hidden_units, return_sequences=True, return_state=True)
        self.fc = Dense(vocab_size)
        self.attention = BahdanauAttention(hidden_units)

    def call(self, x, hidden, encoder_outputs):
        # Apply attention
        context, attention_weights = self.attention(hidden, encoder_outputs)

        # Embed the input token
        x = self.embedding(x)

        # Concatenate context vector and embedded input
        x = tf.concat([tf.expand_dims(context, 1), x], axis=-1)

        # Pass through the GRU
        output, state = self.gru(x)

        # Final prediction over the vocabulary
        x = self.fc(output)
        return x, state, attention_weights
```
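A minimal usage sketch, just to show how the pieces connect and what shapes to expect. The sizes below (vocab_size, embedding_dim, hidden_units, etc.) are arbitrary:

```python
# Hypothetical sizes, just to show the expected shapes
batch_size, src_len = 2, 10
vocab_size, embedding_dim, hidden_units = 5000, 64, 128

decoder = Decoder(vocab_size, embedding_dim, hidden_units)

encoder_outputs = tf.random.normal((batch_size, src_len, hidden_units))
decoder_hidden  = tf.random.normal((batch_size, hidden_units))
decoder_input   = tf.constant([[1], [1]])  # e.g. one <start> token per sequence

logits, state, attn = decoder(decoder_input, decoder_hidden, encoder_outputs)
print(logits.shape)  # (2, 1, 5000)
print(attn.shape)    # (2, 10, 1)
```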
Self-Attention
Each element attends to all elements (including itself):
```python
def self_attention(X, Wq, Wk, Wv):
    """
    X: input sequence [..., seq_len, d_model]
    Wq, Wk, Wv: learned projection matrices [d_model, d_model]
    """
    # Create Q, K, V from the same input
    Q = X @ Wq
    K = X @ Wk
    V = X @ Wv

    # Scaled dot-product attention
    output, weights = scaled_dot_product_attention(Q, K, V)
    return output, weights

# Example: "The cat sat"
# "cat" can attend to "The", "cat", "sat"
# → the model learns relationships between all words
```
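A toy run of the sketch above. The projection matrices are random here; in a real model they are learned parameters:

```python
# Toy run: a 3-"word" sequence with 4-dimensional embeddings
rng = np.random.default_rng(1)
X  = rng.normal(size=(3, 4))          # "The", "cat", "sat"
Wq = rng.normal(size=(4, 4))
Wk = rng.normal(size=(4, 4))
Wv = rng.normal(size=(4, 4))

output, weights = self_attention(X, Wq, Wk, Wv)
print(weights.shape)  # (3, 3): each word's attention over all three words
```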
Multi-Head Attention
Multiple attention mechanisms in parallel:
```python
class MultiHeadAttention(Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % num_heads == 0
        self.depth = d_model // num_heads

        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)

    def split_heads(self, x, batch_size):
        # [batch, seq_len, d_model] → [batch, num_heads, seq_len, depth]
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, query, key, value):
        batch_size = tf.shape(query)[0]

        # Linear projections
        query = self.wq(query)
        key = self.wk(key)
        value = self.wv(value)

        # Split into multiple heads
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        # Scaled dot-product attention for each head
        # (reuses the scaled_dot_product_attention defined earlier)
        scaled_attention, _ = scaled_dot_product_attention(query, key, value)

        # Concatenate heads:
        # [batch, num_heads, seq_len, depth] → [batch, seq_len, d_model]
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))

        # Final linear projection
        output = self.dense(concat_attention)
        return output
```
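A quick shape check with hypothetical sizes (8 heads over a 128-dimensional model). Note that this sketch mixes the NumPy scaled_dot_product_attention from earlier with eager TensorFlow tensors, which generally works in eager mode; a production layer would implement the attention with tf ops throughout:

```python
mha = MultiHeadAttention(d_model=128, num_heads=8)

x = tf.random.normal((2, 10, 128))   # [batch, seq_len, d_model]
out = mha(x, x, x)                   # self-attention: query = key = value
print(out.shape)                     # (2, 10, 128)
```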
Visualizing Attention
```python
import matplotlib.pyplot as plt
import seaborn as sns

def plot_attention(attention, sentence, translation):
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(attention,
                xticklabels=sentence,
                yticklabels=translation,
                ax=ax)
    ax.set_xlabel('Input')
    ax.set_ylabel('Output')
    plt.show()

# Shows which input words the model focused on for each output word
```
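For example, with a made-up attention matrix (rows = output words, columns = input words, each row normalized to sum to 1), reusing the NumPy import from earlier:

```python
sentence    = ["The", "cat", "sat", "on", "the", "mat"]
translation = ["Le", "chat", "était", "assis", "sur", "le", "tapis"]

# Random matrix standing in for real attention weights (illustrative only)
attention = np.random.default_rng(2).random((len(translation), len(sentence)))
attention = attention / attention.sum(axis=1, keepdims=True)

plot_attention(attention, sentence, translation)
```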
Applications
- Machine translation
- Text summarization
- Image captioning
- Question answering
- Speech recognition
Remember
- Attention allows focusing on relevant parts of the input
- Self-attention is the core of Transformers
- Multi-head attention learns different relationships in parallel
- Visualization helps you understand what the model learned