Transformers Architecture
Understand the Transformer model.
The architecture behind modern NLP.
What are Transformers?
A neural network architecture built entirely on attention mechanisms to model relationships between sequence positions.
**Revolutionary**: No RNNs or CNNs needed!
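To make "attention mechanism" concrete, here is a minimal sketch of scaled dot-product attention, the operation at the heart of every Transformer layer. The function name and toy shapes are illustrative additions, not part of the original slides.

```python
import tensorflow as tf

def scaled_dot_product_attention(q, k, v):
    # Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V
    d_k = tf.cast(tf.shape(k)[-1], tf.float32)
    scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(d_k)
    weights = tf.nn.softmax(scores, axis=-1)  # how strongly each position attends to the others
    return tf.matmul(weights, v)

# Toy example: batch of 1 sequence, 4 positions, model size 8
x = tf.random.normal((1, 4, 8))
out = scaled_dot_product_attention(x, x, x)  # self-attention: Q = K = V
print(out.shape)  # (1, 4, 8)
```

Multi-head attention simply runs several of these attention computations in parallel and concatenates the results.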
Transformer Components
- **Encoder**: Processes the input sequence
- **Decoder**: Generates the output sequence
- **Attention**: Lets each position focus on the most relevant other positions
- **Positional Encoding**: Adds position information to the embeddings
Simple Transformer
```python
import tensorflow as tf

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=d_model
        )
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout)
        self.dropout2 = tf.keras.layers.Dropout(dropout)

    def call(self, inputs, training=False):
        # Multi-head self-attention (query and value are both the input)
        attn_output = self.attention(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Feed forward
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
```
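A quick way to check the block behaves as expected. The batch size, sequence length, and layer sizes below are illustrative values, not from the slides:

```python
# The block preserves the input shape: (batch, sequence length, d_model)
block = TransformerBlock(d_model=128, num_heads=8, ff_dim=512)
dummy = tf.random.normal((32, 20, 128))
print(block(dummy, training=False).shape)  # (32, 20, 128)
```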
Positional Encoding
Add position information:
```python
import numpy as np

def positional_encoding(length, depth):
    positions = np.arange(length)[:, np.newaxis]      # (length, 1)
    depths = np.arange(depth)[np.newaxis, :] / depth  # (1, depth)
    angle_rates = 1 / (10000 ** depths)
    angle_rads = positions * angle_rates              # (length, depth)
    pos_encoding = np.concatenate([
        np.sin(angle_rads[:, 0::2]),
        np.cos(angle_rads[:, 1::2])
    ], axis=-1)
    return tf.cast(pos_encoding, dtype=tf.float32)

# Add to embeddings
embeddings = embeddings + positional_encoding(seq_length, d_model)
```
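A short sanity check with assumed values: the function returns one encoding vector per position, each of size `depth`.

```python
# One row per position, one column per embedding dimension
pe = positional_encoding(length=100, depth=128)
print(pe.shape)  # (100, 128)
```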
Full Transformer Model
```python
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, ff_dim,
                 vocab_size, max_length, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.pos_encoding = positional_encoding(max_length, d_model)
        self.encoder_layers = [
            TransformerBlock(d_model, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout)
        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, training=False):
        # Embedding (scaled by sqrt(d_model)) and position encoding
        x = self.embedding(inputs)
        x *= tf.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:tf.shape(inputs)[1], :]
        x = self.dropout(x, training=training)

        # Encoder layers
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x, training=training)

        # Output logits over the vocabulary
        return self.final_layer(x)

# Create model
model = Transformer(
    num_layers=4,
    d_model=128,
    num_heads=8,
    ff_dim=512,
    vocab_size=10000,
    max_length=100
)
```
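To confirm the model wires together, a forward pass on hypothetical token IDs (the batch size and sequence length below are illustrative):

```python
# Random token IDs in [0, vocab_size): shape (batch, sequence length)
tokens = tf.random.uniform((2, 30), minval=0, maxval=10000, dtype=tf.int32)
logits = model(tokens, training=False)
print(logits.shape)  # (2, 30, 10000) -- one vocabulary distribution per position
```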
Training Transformer
```python
# Compile -- the final Dense layer outputs raw logits, so use from_logits=True
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train
model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset
)
```
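The slides assume `train_dataset` and `val_dataset` already exist. A minimal sketch of how such datasets could be built for next-token prediction, using random token IDs purely for illustration:

```python
# Hypothetical data: each target sequence is the input shifted by one position
inputs = tf.random.uniform((1000, 100), maxval=10000, dtype=tf.int32)
targets = tf.roll(inputs, shift=-1, axis=1)

train_dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).batch(64)
val_dataset = train_dataset.take(2)  # small held-out slice, just for the example
```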
Famous Transformers
- **BERT**: Bidirectional encoder
- **GPT**: Generative pre-training
- **T5**: Text-to-text transfer
- **RoBERTa**: Optimized BERT
- **XLNet**: Permutation language modeling
Applications
- Text generation
- Translation
- Question answering
- Text classification
- Summarization
Remember
- Transformers use only attention
- Process sequences in parallel (fast!)
- Need positional encoding
- Foundation of modern NLP