AI · 6 min read

Data Augmentation

Increase training data artificially.

Robert Anderson
December 18, 2025
0.0k0

Create more training data.

What is Data Augmentation?

Creating new training examples from existing ones.

Goal: More data → Better model

Image Augmentation

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define augmentations. Each option is sampled randomly per image, so the
# model sees a slightly different version of every photo on each epoch.
datagen = ImageDataGenerator(
    rotation_range=20,      # Rotate up to 20 degrees
    width_shift_range=0.2,  # Shift left/right by up to 20% of the width
    height_shift_range=0.2, # Shift up/down by up to 20% of the height
    horizontal_flip=True,   # Mirror image
    zoom_range=0.2,         # Zoom in/out by up to 20%
    fill_mode='nearest'     # Fill pixels exposed by a transform with the nearest edge pixel
)

# Apply to images. The labels must be passed together with the images:
# without them the iterator yields inputs only and model.fit() has no
# targets to train against. `labels` is the target array aligned with `images`.
augmented_images = datagen.flow(images, labels, batch_size=32)

# Use in training
model.fit(augmented_images, epochs=50)

Albumentations (Better for Images)

import albumentations as A
import cv2

# Define pipeline: transforms run in order, each applied with its own probability `p`
transform = A.Compose([
    A.RandomRotate90(),   # Rotate by a random multiple of 90 degrees
    A.Flip(),             # Flip horizontally, vertically, or both
    A.Transpose(),        # Swap rows and columns
    A.GaussNoise(),       # Add Gaussian noise
    A.OneOf([             # At most one of the blurs below is applied
        A.MotionBlur(p=0.2),
        A.MedianBlur(blur_limit=3, p=0.1),
        A.Blur(blur_limit=3, p=0.1),
    ], p=0.2),
    A.RandomBrightnessContrast(p=0.2),
])

# Apply. cv2.imread() returns None (it does not raise) when the file is
# missing or unreadable, so fail here with a clear message instead of
# letting the pipeline crash on a None image.
image = cv2.imread('photo.jpg')
if image is None:
    raise FileNotFoundError("Could not read image 'photo.jpg'")
augmented = transform(image=image)['image']

Text Augmentation

import nlpaug.augmenter.word as naw

# Synonym replacement: swap some words for WordNet synonyms
aug = naw.SynonymAug(aug_src='wordnet')
text = "The food in Miami is amazing"
augmented = aug.augment(text)
print(augmented)
# Example output (replacements are random, so it varies between runs):
# "The food in Miami is fantastic"
# NOTE(review): recent nlpaug releases appear to return a list of strings
# rather than a single string — confirm against the installed version.

# Back translation (more realistic): translate English -> German -> English
# so the sentence is rephrased while keeping its meaning.
# `aug` and `augmented` are rebound here; the synonym results above are overwritten.
aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de',
    to_model_name='facebook/wmt19-de-en'
)
augmented = aug.augment(text)

Audio Augmentation

import librosa
import numpy as np

def augment_audio(audio, sr, rate=1.1, n_steps=2, noise_level=0.005):
    """Return three augmented variants of an audio signal.

    Args:
        audio: Waveform as a NumPy array of samples.
        sr: Sampling rate of `audio`; required by the pitch shift.
        rate: Time-stretch factor passed to librosa (the default 1.1
            matches the previously hard-coded value).
        n_steps: Pitch shift amount passed to librosa (default 2).
        noise_level: Scale applied to the unit Gaussian noise (default 0.005).

    Returns:
        A list `[stretched, pitched, noisy]`. The stretched variant
        generally has a different number of samples than the input.
    """
    # Time stretch
    audio_stretched = librosa.effects.time_stretch(audio, rate=rate)

    # Pitch shift
    audio_pitched = librosa.effects.pitch_shift(audio, sr=sr, n_steps=n_steps)

    # Add noise. Draw it with the full shape of the signal rather than
    # len(audio), so multi-dimensional (e.g. multi-channel) input gets one
    # independent noise value per sample instead of a shape mismatch.
    noise = np.random.randn(*np.shape(audio))
    audio_noisy = audio + noise_level * noise

    return [audio_stretched, audio_pitched, audio_noisy]

Tabular Data Augmentation

from sklearn.utils import resample
import pandas as pd

# Oversample minority class
# Assumes `df` has a binary 'target' column where 1 is the rarer class —
# TODO confirm against your data before reusing this snippet.
df_minority = df[df['target'] == 1]
df_majority = df[df['target'] == 0]

# Resample with replacement: rows of the minority class are drawn repeatedly
# (duplicates, not new data) until there are as many as in the majority class.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42  # fixed seed so the same rows are drawn on every run
)

# Combine. The result is not shuffled: all majority rows come first,
# followed by the upsampled minority rows.
df_balanced = pd.concat([df_majority, df_minority_upsampled])

SMOTE for Tabular Data

from imblearn.over_sampling import SMOTE

# Create synthetic samples. Unlike plain oversampling, SMOTE generates new
# minority-class points instead of duplicating existing rows.
# A fixed random_state keeps the result reproducible between runs.
# Fit it on the training split only, so validation data stays untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print(f"Original: {len(X)}")
print(f"After SMOTE: {len(X_resampled)}")

MixUp (Advanced)

Blend two images, and their labels, using a random mixing weight:

def mixup(x1, y1, x2, y2, alpha=0.2):
    """Blend two samples and their labels with a random convex weight.

    The weight is drawn from Beta(alpha, alpha); the same weight is applied
    to the inputs and to the labels.

    Args:
        x1, x2: The two inputs to blend (must be broadcast-compatible).
        y1, y2: Their labels (must be broadcast-compatible).
        alpha: Shape parameter of the Beta distribution.

    Returns:
        Tuple `(x, y)` of the blended input and the blended label.
    """
    weight_first = np.random.beta(alpha, alpha)
    weight_second = 1 - weight_first

    blended_input = weight_first * x1 + weight_second * x2
    blended_label = weight_first * y1 + weight_second * y2

    return blended_input, blended_label

# Use in training
x_mix, y_mix = mixup(image1, label1, image2, label2)

CutMix

Cut a rectangular patch out of one image and fill it with the same region from another:

def cutmix(x1, y1, x2, y2, alpha=1.0):
    """Paste a random rectangle of `x2` into a copy of `x1` and mix the labels.

    Args:
        x1: Base image as a NumPy array whose first two axes are
            (height, width). It is not modified.
        y1: Label of `x1` (e.g. a one-hot vector).
        x2: Image the patch is taken from; indexed with the same box, so it
            should be at least as large as `x1` along the first two axes.
        y2: Label of `x2`.
        alpha: Shape parameter of the Beta distribution that sets the
            target share of `x1` kept in the result.

    Returns:
        Tuple `(x, y)`: the mixed image and the label
        `lam * y1 + (1 - lam) * y2`, where `lam` is the fraction of
        pixels that still come from `x1`.
    """
    lam = np.random.beta(alpha, alpha)

    # Random box whose area is roughly (1 - lam) of the image
    h, w = x1.shape[:2]
    cut_ratio = np.sqrt(1 - lam)
    cut_h = int(h * cut_ratio)
    cut_w = int(w * cut_ratio)

    cx = np.random.randint(w)
    cy = np.random.randint(h)

    # Clip the box to the image; boxes near the border end up smaller
    left, top = np.clip([cx - cut_w // 2, cy - cut_h // 2], 0, [w, h])
    right, bottom = np.clip([cx + cut_w // 2, cy + cut_h // 2], 0, [w, h])

    # Work on a copy so the caller's x1 is not overwritten
    mixed = x1.copy()
    mixed[top:bottom, left:right] = x2[top:bottom, left:right]

    # Recompute lam from the box actually pasted (clipping and integer
    # rounding change its area) and mix the labels in the same proportion.
    lam = 1 - ((right - left) * (bottom - top)) / (h * w)
    y = lam * y1 + (1 - lam) * y2

    return mixed, y

Best Practices

  1. Images: Augment by default (flip, rotate, crop), but skip transforms that change the label (e.g. flipping digits or text)
  2. Text: Be careful (meaning shouldn't change)
  3. Validation: Never augment validation data!
  4. Realistic: Augmentations should be realistic

Remember

  • More data = better model
  • Augment during training
  • Don't overdo it
  • Keep validation data clean
#AI #Intermediate #Data