Transformers Architecture
Understand the Transformer model.
Dr. Emily Watson
December 18, 2025
The architecture behind modern NLP.
What are Transformers?
A neural network architecture that relies entirely on attention mechanisms, introduced in the paper "Attention Is All You Need" (2017).
Revolutionary: no RNNs or CNNs needed!
Transformer Components
Encoder: Processes input sequence
Decoder: Generates output sequence
Attention: Focus mechanism that lets each token weigh every other token (see the sketch after this list)
Positional Encoding: Add position information
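At the core of all these components is scaled dot-product attention: Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V. Below is a minimal sketch in TensorFlow; the function name and the random toy tensors are illustrative, not taken from the article or any library.
import tensorflow as tf

def scaled_dot_product_attention(q, k, v):
    # Scores: similarity of every query with every key, scaled by sqrt(d_k)
    d_k = tf.cast(tf.shape(k)[-1], tf.float32)
    scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(d_k)  # (..., seq_q, seq_k)
    weights = tf.nn.softmax(scores, axis=-1)                   # attention weights
    return tf.matmul(weights, v)                               # (..., seq_q, d_v)

# Toy example: batch of 2 sequences, length 5, dimension 16
q = k = v = tf.random.normal((2, 5, 16))
print(scaled_dot_product_attention(q, k, v).shape)  # (2, 5, 16)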
Simple Transformer
import tensorflow as tf

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model  # size per attention head; d_model // num_heads is also common
        )
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout)
        self.dropout2 = tf.keras.layers.Dropout(dropout)

    def call(self, inputs, training=False):
        # Multi-head self-attention (query, key, and value all come from the same input)
        attn_output = self.attention(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)   # residual connection + layer norm
        # Position-wise feed-forward network
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)      # residual connection + layer norm
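A quick, illustrative sanity check: the block preserves the (batch, seq_len, d_model) shape.
block = TransformerBlock(d_model=128, num_heads=8, ff_dim=512)
x = tf.random.normal((2, 10, 128))     # (batch, seq_len, d_model)
print(block(x, training=False).shape)  # (2, 10, 128)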
Positional Encoding
Add position information:
import numpy as np

def positional_encoding(length, depth):
    # Each position gets a d_model-sized vector of sines and cosines
    # at geometrically decreasing frequencies.
    positions = np.arange(length)[:, np.newaxis]        # (length, 1)
    depths = np.arange(depth)[np.newaxis, :] / depth    # (1, depth)
    angle_rates = 1 / (10000 ** depths)                 # (1, depth)
    angle_rads = positions * angle_rates                # (length, depth)

    pos_encoding = np.concatenate([
        np.sin(angle_rads[:, 0::2]),   # sine on even indices
        np.cos(angle_rads[:, 1::2])    # cosine on odd indices
    ], axis=-1)

    return tf.cast(pos_encoding, dtype=tf.float32)

# Add to embeddings
embeddings = embeddings + positional_encoding(seq_length, d_model)
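A quick check (illustrative values): the encoding has one d_model-sized vector per position, and all entries stay within [-1, 1].
pe = positional_encoding(length=100, depth=128)
print(pe.shape)                                            # (100, 128)
print(float(tf.reduce_min(pe)), float(tf.reduce_max(pe)))  # both within [-1, 1]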
Full Transformer Model
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, ff_dim,
                 vocab_size, max_length, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.pos_encoding = positional_encoding(max_length, d_model)
        self.encoder_layers = [
            TransformerBlock(d_model, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout)
        self.final_layer = tf.keras.layers.Dense(vocab_size)  # logits over the vocabulary

    def call(self, inputs, training=False):
        # Embedding, scaled by sqrt(d_model), plus positional encoding
        x = self.embedding(inputs)
        x *= tf.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:tf.shape(inputs)[1], :]
        x = self.dropout(x, training=training)

        # Encoder layers
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x, training=training)

        # Output
        return self.final_layer(x)

# Create model
model = Transformer(
    num_layers=4,
    d_model=128,
    num_heads=8,
    ff_dim=512,
    vocab_size=10000,
    max_length=100
)
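Before training, it helps to confirm the output shape on a batch of dummy token IDs (illustrative data only):
dummy_tokens = tf.random.uniform((2, 20), maxval=10000, dtype=tf.int32)  # (batch, seq_len)
logits = model(dummy_tokens, training=False)
print(logits.shape)  # (2, 20, 10000): one logit per vocabulary entry, per position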
Training Transformer
# Compile: the model outputs raw logits, so use from_logits=True
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train (train_dataset / val_dataset: batches of (token_ids, target_ids))
model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset
)
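After training, the model returns per-position logits over the vocabulary; taking the argmax gives predicted token IDs. A minimal usage sketch, assuming val_dataset yields (token_ids, target_ids) batches as above:
for tokens, targets in val_dataset.take(1):
    logits = model(tokens, training=False)      # (batch, seq_len, vocab_size)
    predicted_ids = tf.argmax(logits, axis=-1)  # (batch, seq_len)
    print(predicted_ids[0].numpy()[:10])        # first 10 predicted token IDs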
Famous Transformers
BERT: Bidirectional Encoder Representations from Transformers (encoder-only)
GPT: Generative Pre-trained Transformer (decoder-only)
T5: Text-to-Text Transfer Transformer (encoder-decoder)
RoBERTa: Robustly optimized BERT pretraining
XLNet: Permutation language modeling
Applications
- Text generation
- Translation
- Question answering
- Text classification
- Summarization
Remember
- Transformers use only attention
- Process sequences in parallel (fast!)
- Need positional encoding
- Foundation of modern NLP
#AI #Advanced #Transformers