AI · 6 min read
Data Augmentation
Increase training data artificially.
Robert Anderson
December 18, 2025
Data augmentation creates more training data from the examples you already have, without collecting anything new.
What is Data Augmentation?
Creating new training examples from existing ones.
Goal: more (and more varied) training data → a model that generalizes better.
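A minimal sketch of the idea in plain NumPy: a horizontal flip already doubles the number of distinct examples, and the labels stay the same. The array shapes and the images/labels variables here are just placeholders.
import numpy as np

# Hypothetical batch of images: (num_images, height, width, channels)
images = np.random.rand(100, 224, 224, 3)
labels = np.random.randint(0, 10, size=100)

# Horizontal flip = mirror along the width axis; labels are unchanged
flipped = images[:, :, ::-1, :]

augmented_images = np.concatenate([images, flipped])
augmented_labels = np.concatenate([labels, labels])
print(augmented_images.shape)  # (200, 224, 224, 3) -- twice the data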
Image Augmentation
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define augmentations
datagen = ImageDataGenerator(
    rotation_range=20,       # Rotate up to 20 degrees
    width_shift_range=0.2,   # Shift left/right
    height_shift_range=0.2,  # Shift up/down
    horizontal_flip=True,    # Mirror image
    zoom_range=0.2,          # Zoom in/out
    fill_mode='nearest'      # Fill exposed pixels with nearest neighbours
)

# Apply to images (labels are needed so the batches can be fed to fit)
augmented_images = datagen.flow(images, labels, batch_size=32)

# Use in training -- new random augmentations are generated every epoch
model.fit(augmented_images, epochs=50)
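On recent TensorFlow releases the same kind of augmentation can be written as Keras preprocessing layers, which run inside the model and are only active during training. The layer choices below are a rough sketch of the generator settings above, not an exact equivalent.
import tensorflow as tf

augment = tf.keras.Sequential([
    tf.keras.layers.RandomFlip('horizontal'),      # mirror image
    tf.keras.layers.RandomRotation(0.055),         # ~20 degrees (factor is a fraction of a full turn)
    tf.keras.layers.RandomTranslation(0.2, 0.2),   # shift up/down and left/right
    tf.keras.layers.RandomZoom(0.2),               # zoom in/out
])

# Place these as the first layers of the model, or apply them explicitly;
# they are skipped automatically at inference time
x_augmented = augment(images, training=True)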
Albumentations (Faster and More Flexible)
import albumentations as A
import cv2

# Define pipeline -- each transform fires with its own probability p
transform = A.Compose([
    A.RandomRotate90(),
    A.Flip(),
    A.Transpose(),
    A.GaussNoise(),
    A.OneOf([                       # apply at most one type of blur
        A.MotionBlur(p=0.2),
        A.MedianBlur(blur_limit=3, p=0.1),
        A.Blur(blur_limit=3, p=0.1),
    ], p=0.2),
    A.RandomBrightnessContrast(p=0.2),
])

# Apply (OpenCV loads BGR, Albumentations expects RGB)
image = cv2.imread('photo.jpg')
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
augmented = transform(image=image)['image']
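In practice augmentation is applied on the fly, once per sample per epoch, rather than pre-generating files. A sketch of that pattern with a PyTorch-style Dataset, continuing from the snippet above (the AugmentedDataset class and the paths/labels variables are illustrative, not part of Albumentations):
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class AugmentedDataset(Dataset):
    """Hypothetical wrapper that applies the Albumentations pipeline per sample."""
    def __init__(self, paths, labels, transform):
        self.paths, self.labels, self.transform = paths, labels, transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        image = cv2.cvtColor(cv2.imread(self.paths[idx]), cv2.COLOR_BGR2RGB)
        image = self.transform(image=image)['image']  # fresh random augmentation on every access
        return np.ascontiguousarray(image), self.labels[idx]

loader = DataLoader(AugmentedDataset(paths, labels, transform), batch_size=32, shuffle=True)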
Text Augmentation
import nlpaug.augmenter.word as naw

# Synonym replacement
aug = naw.SynonymAug(aug_src='wordnet')
text = "The food in Miami is amazing"
augmented = aug.augment(text)
print(augmented)
# e.g. "The food in Miami is fantastic" (recent nlpaug versions return a list)

# Back translation (slower, but produces more natural paraphrases)
aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de',
    to_model_name='facebook/wmt19-de-en'
)
augmented = aug.augment(text)
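If downloading translation models is too heavy, nlpaug also ships simple word-level augmenters. Continuing from the snippet above; the probability value here is an arbitrary example.
# Lightweight alternatives: delete or swap a few words at random
delete_aug = naw.RandomWordAug(action='delete', aug_p=0.1)  # drop roughly 10% of words
swap_aug = naw.RandomWordAug(action='swap')                 # swap neighbouring words

print(delete_aug.augment(text))
print(swap_aug.augment(text))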
Audio Augmentation
import librosa
import numpy as np
def augment_audio(audio, sr):
    # Time stretch
    audio_stretched = librosa.effects.time_stretch(audio, rate=1.1)
    # Pitch shift
    audio_pitched = librosa.effects.pitch_shift(audio, sr=sr, n_steps=2)
    # Add noise
    noise = np.random.randn(len(audio))
    audio_noisy = audio + 0.005 * noise
    return [audio_stretched, audio_pitched, audio_noisy]
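A usage sketch, assuming a file called speech.wav and the soundfile package for writing the results (both are placeholders, not requirements of librosa):
import soundfile as sf

# Load a clip at its native sample rate and write out the three variants
audio, sr = librosa.load('speech.wav', sr=None)
for i, variant in enumerate(augment_audio(audio, sr)):
    sf.write(f'speech_aug_{i}.wav', variant, sr)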
Tabular Data Augmentation
from sklearn.utils import resample
import pandas as pd

# Oversample minority class
df_minority = df[df['target'] == 1]
df_majority = df[df['target'] == 0]

# Resample with replacement
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)

# Combine
df_balanced = pd.concat([df_majority, df_minority_upsampled])
SMOTE for Tabular Data
from imblearn.over_sampling import SMOTE
# Create synthetic samples
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)
print(f"Original: {len(X)}")
print(f"After SMOTE: {len(X_resampled)}")
MixUp (Advanced)
Blend two images and their labels into one training example:
def mixup(x1, y1, x2, y2, alpha=0.2):
    # Random mixing ratio
    lam = np.random.beta(alpha, alpha)
    # Mix images
    x = lam * x1 + (1 - lam) * x2
    # Mix labels (labels must be one-hot / soft vectors, not class indices)
    y = lam * y1 + (1 - lam) * y2
    return x, y

# Use in training
x_mix, y_mix = mixup(image1, label1, image2, label2)
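In practice MixUp is usually applied per batch rather than per pair. Since the function above is plain NumPy arithmetic, it vectorises directly; x_batch and the one-hot y_batch below are hypothetical arrays.
# Pair each example with a shuffled partner; one lambda is shared across the batch
idx = np.random.permutation(len(x_batch))
x_mix, y_mix = mixup(x_batch, y_batch, x_batch[idx], y_batch[idx])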
CutMix
Cut a patch from one image, paste it into another, and mix the labels by area:
def cutmix(x1, y1, x2, y2, alpha=1.0):
    lam = np.random.beta(alpha, alpha)
    # Random box covering roughly (1 - lam) of the image area
    h, w = x1.shape[:2]
    cut_h = int(h * np.sqrt(1 - lam))
    cut_w = int(w * np.sqrt(1 - lam))
    cx = np.random.randint(w)
    cy = np.random.randint(h)
    left, top = np.clip([cx - cut_w // 2, cy - cut_h // 2], 0, [w, h])
    right, bottom = np.clip([cx + cut_w // 2, cy + cut_h // 2], 0, [w, h])
    # Paste the patch from the second image into the first
    x1[top:bottom, left:right] = x2[top:bottom, left:right]
    # Mix labels in proportion to the area each image actually occupies
    lam = 1 - ((right - left) * (bottom - top)) / (h * w)
    y = lam * y1 + (1 - lam) * y2
    return x1, y
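Because cutmix edits each image in place, a batch version pairs examples one at a time (again with hypothetical x_batch and one-hot y_batch; the .copy() keeps the originals intact):
idx = np.random.permutation(len(x_batch))
pairs = [cutmix(x_batch[i].copy(), y_batch[i], x_batch[idx[i]], y_batch[idx[i]])
         for i in range(len(x_batch))]
x_aug = np.stack([p[0] for p in pairs])
y_aug = np.stack([p[1] for p in pairs])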
Best Practices
- Images: Always augment (flip, rotate, crop)
- Text: Be careful (meaning shouldn't change)
- Validation: Never augment validation or test data (see the sketch after this list)
- Realistic: Augmentations should reflect variation the model will actually see
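For the validation rule, a sketch of the usual Keras setup using the ImageDataGenerator imported earlier (the directory names are placeholders): only the training generator augments, the validation generator just rescales.
# Augment training data only; validation stays untouched apart from rescaling
train_gen = ImageDataGenerator(rescale=1/255., rotation_range=20, horizontal_flip=True)
val_gen = ImageDataGenerator(rescale=1/255.)

train_data = train_gen.flow_from_directory('data/train', target_size=(224, 224))
val_data = val_gen.flow_from_directory('data/val', target_size=(224, 224))
model.fit(train_data, validation_data=val_data, epochs=50)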
Remember
- More data = better model
- Augment during training
- Don't overdo it
- Keep validation data clean
#AI #Intermediate #Data