Deep
Learning
Where the real magic begins. Classical ML hits a wall with images, text, audio, and sequences. Deep learning breaks through it — neural networks learn hierarchical representations that no human could hand-craft. This phase takes you from a single neuron to the Transformer architecture that powers GPT-4 and Claude.
Build Networks from Scratch
& Understand Every Gradient
Before you reach for PyTorch, implement backpropagation by hand. Once you understand why gradients flow the way they do, every training failure becomes diagnosable.
By the end, you'll know exactly what loss.backward() actually does, and why.
import numpy as np


class Layer:
    """Fully-connected layer with manually-derived gradients."""

    def __init__(self, in_dim, out_dim):
        # He initialization for ReLU networks
        self.W = np.random.randn(in_dim, out_dim) * np.sqrt(2.0 / in_dim)
        self.b = np.zeros((1, out_dim))
        self.dW = self.db = None

    def forward(self, x):
        self.x = x  # cache for backward pass
        return x @ self.W + self.b

    def backward(self, d_out):
        # d_out: gradient of the loss w.r.t. this layer's output.
        self.dW = self.x.T @ d_out
        self.db = d_out.sum(axis=0, keepdims=True)
        return d_out @ self.W.T  # gradient to pass to previous layer

    def update(self, lr):
        # Plain SGD step on the cached gradients.
        self.W -= lr * self.dW
        self.b -= lr * self.db


class ReLU:
    """Elementwise max(0, x) with cached mask for the backward pass."""

    def forward(self, x):
        self.mask = x > 0
        return x * self.mask

    def backward(self, d_out):
        return d_out * self.mask  # kill gradient where input <= 0


class CrossEntropyLoss:
    """Softmax + cross-entropy, fused for numerical stability."""

    def forward(self, logits, y):
        # Subtract the row max so exp() never overflows.
        shifted = logits - logits.max(axis=1, keepdims=True)
        exp_s = np.exp(shifted)
        self.probs = exp_s / exp_s.sum(axis=1, keepdims=True)
        self.y = y
        n = logits.shape[0]
        # 1e-9 guards log(0) when a true-class probability underflows.
        return -np.log(self.probs[np.arange(n), y] + 1e-9).mean()

    def backward(self):
        # Gradient of mean CE w.r.t. logits: (softmax - one_hot) / n.
        n = self.probs.shape[0]
        d = self.probs.copy()
        d[np.arange(n), self.y] -= 1
        return d / n


class MLP:
    """Multi-layer perceptron trained with hand-written backprop."""

    def __init__(self, dims):
        # dims: e.g. [784, 256, 128, 10] — layer widths, input first.
        self.layers = [Layer(dims[i], dims[i + 1]) for i in range(len(dims) - 1)]
        self.relus = [ReLU() for _ in range(len(dims) - 2)]  # no ReLU after last layer
        self.loss_fn = CrossEntropyLoss()

    def _logits(self, x):
        # Shared forward pass (Linear -> ReLU except after the final layer),
        # used by both forward() and predict() so the logic lives in one place.
        a = x
        for i, layer in enumerate(self.layers):
            z = layer.forward(a)
            a = self.relus[i].forward(z) if i < len(self.relus) else z
        return a

    def forward(self, x, y):
        # Returns scalar mean cross-entropy loss for a batch.
        return self.loss_fn.forward(self._logits(x), y)

    def backward(self):
        # Walk layers in reverse; undo ReLU before its preceding Linear.
        d = self.loss_fn.backward()
        for i in reversed(range(len(self.layers))):
            if i < len(self.relus):
                d = self.relus[i].backward(d)
            d = self.layers[i].backward(d)

    def step(self, lr):
        for layer in self.layers:
            layer.update(lr)

    def predict(self, x):
        # Class index with the highest logit per row.
        return self._logits(x).argmax(axis=1)


# ── TRAIN ON SYNTHETIC DATA ────────────────────────────────────────────
np.random.seed(42)
# 4-class classification, 2D input: class = quadrant of the point.
X = np.random.randn(1000, 2)
y = ((X[:, 0] > 0).astype(int) + 2 * (X[:, 1] > 0).astype(int))

net = MLP([2, 64, 32, 4])
lr = 0.05
batch_size = 64

for epoch in range(100):
    idx = np.random.permutation(len(X))  # reshuffle each epoch
    total_loss = 0
    for i in range(0, len(X), batch_size):
        xb = X[idx[i:i + batch_size]]
        yb = y[idx[i:i + batch_size]]
        loss = net.forward(xb, yb)
        net.backward()
        net.step(lr)
        total_loss += loss
    if epoch % 20 == 0:
        acc = (net.predict(X) == y).mean()
        print(f"Epoch {epoch:3d} | loss: {total_loss:.3f} | acc: {acc:.3f}")


# ── NUMERICAL GRADIENT CHECK (debug tool) ─────────────────────────────
# Verify your analytical gradients match finite differences
def numerical_gradient(f, x, eps=1e-5):
    """Central-difference gradient of scalar function f at array x."""
    grad = np.zeros_like(x)
    for idx in np.ndindex(x.shape):
        x_plus = x.copy(); x_plus[idx] += eps
        x_minus = x.copy(); x_minus[idx] -= eps
        grad[idx] = (f(x_plus) - f(x_minus)) / (2 * eps)
    return grad
The Premier Framework
for Deep Learning
PyTorch is the dominant framework in both research and industry. Its dynamic computation graph makes debugging feel natural. Every modern LLM was trained with it or a variant.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np

# ── TENSORS — the basics ───────────────────────────────────────────────
x = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
print(x.shape, x.dtype, x.device)  # torch.Size([2, 2]) float32 cpu

# Move to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
x = x.to(device)

# Common operations (same as NumPy, but on GPU)
a = torch.randn(3, 4).to(device)
b = torch.zeros(4, 2).to(device)
c = a @ b                     # matrix multiply -> (3, 2)
d = torch.cat([a, a], dim=1)  # concatenate -> (3, 8)

# Convert between NumPy and PyTorch
np_arr = np.random.randn(5)
t = torch.from_numpy(np_arr).float()
back = t.cpu().numpy()

# ── AUTOGRAD — automatic differentiation ──────────────────────────────
# requires_grad=True tells PyTorch to track operations on this tensor
w = torch.tensor(2.0, requires_grad=True)
x = torch.tensor(3.0)
y = w * x ** 2 + w ** 3  # y = w·x² + w³
y.backward()
print(w.grad)  # dy/dw = x² + 3w² = 9 + 12 = 21.0

# Context manager: no gradient tracking (for inference)
with torch.no_grad():
    inference_result = w * x  # does not build computation graph


# ── nn.MODULE — define your model ──────────────────────────────────────
class MLP(nn.Module):
    """Small MLP head: Linear -> LayerNorm -> ReLU -> Dropout -> ... -> logits."""

    def __init__(self, in_dim, hidden, out_dim, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.LayerNorm(hidden),   # normalize before activation
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden // 2),
            nn.ReLU(),
            nn.Linear(hidden // 2, out_dim),
        )

    def forward(self, x):
        # You only define forward; autograd records it and derives backward.
        return self.net(x)


model = MLP(20, 128, 1).to(device)
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

# ── DATASET + DATALOADER ──────────────────────────────────────────────
X_raw = torch.randn(2000, 20)
y_raw = (X_raw.sum(dim=1) > 0).float().unsqueeze(1)
dataset = TensorDataset(X_raw, y_raw)
train_ds, val_ds, test_ds = random_split(dataset, [1600, 200, 200])

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, num_workers=0)
val_loader = DataLoader(val_ds, batch_size=128, shuffle=False, num_workers=0)

# ── THE TRAINING LOOP ──────────────────────────────────────────────────
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.BCEWithLogitsLoss()
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)

best_val_loss = float('inf')
patience, patience_counter = 10, 0

for epoch in range(100):
    # ── Training ──
    model.train()
    train_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()                  # 1. clear gradients
        pred = model(xb)                       # 2. forward pass
        loss = criterion(pred, yb)             # 3. compute loss
        loss.backward()                        # 4. backward pass
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # clip
        optimizer.step()                       # 5. update weights
        train_loss += loss.item()
    scheduler.step()  # per-epoch LR decay

    # ── Validation ──
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            val_loss += criterion(model(xb), yb).item()

    # Early stopping: keep the checkpoint with the lowest validation loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}"); break

    if epoch % 10 == 0:
        print(f"Ep {epoch:3d} | train: {train_loss/len(train_loader):.4f}"
              f" | val: {val_loss/len(val_loader):.4f}"
              f" | lr: {scheduler.get_last_lr()[0]:.6f}")

# ── SAVE & LOAD ────────────────────────────────────────────────────────
torch.save(model.state_dict(), 'model.pt')
model_loaded = MLP(20, 128, 1).to(device)
model_loaded.load_state_dict(torch.load('model.pt', map_location=device))
model_loaded.eval()

# ── INFERENCE ─────────────────────────────────────────────────────────
with torch.no_grad():
    test_x = torch.randn(10, 20).to(device)
    logits = model_loaded(test_x)
    probs = torch.sigmoid(logits)
    preds = (probs > 0.5).long()
print("Predictions:", preds.squeeze().tolist())
clip_grad_norm_(model.parameters(), 1.0) should be in every training loop. It's especially important for RNNs and Transformers.
Speed up training and inference with model = torch.compile(model). One line of code.
Teach Machines
to See
Convolutional Neural Networks exploit the structure of images — local patterns, translation invariance, and hierarchical features. They remain the workhorses of production vision systems.
→ Conv2d(3→64, k=3, p=1) + BN + ReLU → (64 × 224 × 224)
→ MaxPool2d(2) → (64 × 112 × 112)
→ Conv2d(64→128, k=3, p=1) + BN + ReLU → (128 × 112 × 112)
→ MaxPool2d(2) → (128 × 56 × 56)
→ Conv2d(128→256, k=3, p=1) + BN + ReLU → (256 × 56 × 56)
→ AdaptiveAvgPool2d(1, 1) → (256 × 1 × 1)
→ Flatten → Linear(256 → num_classes) → (num_classes,)
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as T
import torchvision.models as models
from torch.utils.data import DataLoader

device = 'cuda' if torch.cuda.is_available() else 'cpu'


# ── 1. BUILD A CNN FROM SCRATCH ────────────────────────────────────────
class ConvBlock(nn.Module):
    """Conv → BatchNorm → ReLU — the fundamental CNN building block"""

    def __init__(self, in_ch, out_ch, kernel=3, stride=1):
        super().__init__()
        self.block = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel, stride=stride,
                      padding=kernel // 2, bias=False),  # bias=False with BN
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.block(x)


class SmallCNN(nn.Module):
    """Compact CNN for 32x32 inputs: conv stack -> global pool -> classifier."""

    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            ConvBlock(3, 32),
            ConvBlock(32, 64),
            nn.MaxPool2d(2),                 # 32→16
            ConvBlock(64, 128),
            ConvBlock(128, 128),
            nn.MaxPool2d(2),                 # 16→8
            ConvBlock(128, 256),
            nn.AdaptiveAvgPool2d((1, 1)),    # any input size → 1×1
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        return self.classifier(self.features(x))


model = SmallCNN(num_classes=10).to(device)
print(f"Params: {sum(p.numel() for p in model.parameters()):,}")

# ── 2. DATA AUGMENTATION ───────────────────────────────────────────────
train_transform = T.Compose([
    T.RandomCrop(32, padding=4),             # random crop with padding
    T.RandomHorizontalFlip(p=0.5),
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    T.RandomRotation(10),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406],       # ImageNet stats
                [0.229, 0.224, 0.225]),
])
val_transform = T.Compose([
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406],
                [0.229, 0.224, 0.225]),
])

train_ds = torchvision.datasets.CIFAR10('./data', train=True, download=True,
                                        transform=train_transform)
val_ds = torchvision.datasets.CIFAR10('./data', train=False, download=True,
                                      transform=val_transform)

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True,
                          num_workers=4, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=256, shuffle=False, num_workers=4)

# ── 3. TRANSFER LEARNING — ResNet50 ───────────────────────────────────
backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)

# Strategy 1: Freeze backbone, train only head (fast, small datasets)
for param in backbone.parameters():
    param.requires_grad = False
backbone.fc = nn.Sequential(
    nn.Linear(backbone.fc.in_features, 256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, 10),
)

# Strategy 2: Fine-tune all layers with differential learning rates
# (NOTE(review): only layer3/layer4/fc param groups are listed below,
# so earlier layers are effectively frozen — confirm that's intended.)
backbone_full = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
backbone_full.fc = nn.Linear(backbone_full.fc.in_features, 10)
optimizer_full = optim.AdamW([
    {'params': backbone_full.layer4.parameters(), 'lr': 1e-4},
    {'params': backbone_full.layer3.parameters(), 'lr': 5e-5},
    {'params': backbone_full.fc.parameters(), 'lr': 1e-3},
], weight_decay=1e-4)

# ── 4. TRAINING WITH AMP (Automatic Mixed Precision) ──────────────────
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=5e-4)
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=0.01, steps_per_epoch=len(train_loader), epochs=50)

for epoch in range(50):
    model.train()
    correct, total = 0, 0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        with autocast():  # float16 for speed
            out = model(imgs)
            loss = criterion(out, labels)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # unscale before clipping real gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # OneCycleLR steps per BATCH, not per epoch
        correct += (out.argmax(1) == labels).sum().item()
        total += labels.size(0)

    model.eval()
    val_correct = 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            val_correct += (model(imgs).argmax(1) == labels).sum().item()
    print(f"Ep {epoch:2d} | train acc: {correct/total:.3f}"
          f" | val acc: {val_correct/len(val_ds):.3f}")
Model Sequences,
Time, and Memory
Recurrent networks process sequential data step by step, maintaining a hidden state. LSTMs solve the vanishing gradient problem that cripples vanilla RNNs. Essential for time series, NLP, and audio.
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

device = 'cuda' if torch.cuda.is_available() else 'cpu'


# ── 1. VANILLA RNN — understand the mechanics ──────────────────────────
class VanillaRNN(nn.Module):
    """Hand-rolled Elman RNN: h_t = tanh(Wx·x_t + Wh·h_{t-1})."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.Wx = nn.Linear(input_dim, hidden_dim, bias=False)
        self.Wh = nn.Linear(hidden_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        # x: (batch, seq_len, input_dim)
        B, T, _ = x.shape
        h = torch.zeros(B, self.Wh.in_features).to(x.device)
        outputs = []
        for t in range(T):
            h = self.tanh(self.Wx(x[:, t, :]) + self.Wh(h))
            outputs.append(h)
        last_h = outputs[-1]
        return self.out(last_h)  # classify based on final hidden state


# ── 2. LSTM — PyTorch built-in ────────────────────────────────────────
class LSTMClassifier(nn.Module):
    """Sequence classifier on top of a (optionally bidirectional) LSTM."""

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim,
                 dropout=0.3, bidirectional=False):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim, hidden_dim,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=bidirectional,
            batch_first=True,  # (batch, seq, feature) — ALWAYS use this
        )
        d = hidden_dim * (2 if bidirectional else 1)
        self.head = nn.Sequential(
            nn.Linear(d, d // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d // 2, output_dim),
        )

    def forward(self, x):
        # out: (batch, seq, hidden*dirs); h_n: (num_layers*dirs, batch, hidden)
        out, (h_n, c_n) = self.lstm(x)
        # BUGFIX: for a bidirectional LSTM, out[:, -1, :] mixes the forward
        # direction's full-sequence state with the BACKWARD direction's state
        # after reading only the last token. Use h_n instead: it holds each
        # direction's final state for the last layer (fwd at h_n[-2], bwd at
        # h_n[-1] when bidirectional).
        if self.lstm.bidirectional:
            last = torch.cat([h_n[-2], h_n[-1]], dim=1)
        else:
            last = h_n[-1]
        return self.head(last)


# ── 3. TIME SERIES FORECASTING ────────────────────────────────────────
class LSTMForecaster(nn.Module):
    """Encode a history window with an LSTM, project to a forecast vector."""

    def __init__(self, input_dim, hidden_dim, num_layers, forecast_horizon):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, forecast_horizon)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        return self.fc(h_n[-1])  # last layer hidden state → forecast


# ── 4. SEQUENCE-TO-SEQUENCE ───────────────────────────────────────────
class Seq2Seq(nn.Module):
    """Encoder-decoder LSTM; decoder is conditioned on the encoder's (h, c)."""

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.encoder = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, src, tgt):
        enc_out, (h, c) = self.encoder(self.embed(src))
        dec_out, _ = self.decoder(self.embed(tgt), (h, c))
        return self.fc_out(dec_out)


# ── 5. FULL TRAINING EXAMPLE — Sentiment Classification ───────────────
np.random.seed(42); torch.manual_seed(42)

# Fake token sequences (batch, seq_len, features)
SEQ_LEN, BATCH, FEAT = 50, 64, 16
X_train = torch.randn(800, SEQ_LEN, FEAT)
y_train = torch.randint(0, 2, (800,))
X_val = torch.randn(200, SEQ_LEN, FEAT)
y_val = torch.randint(0, 2, (200,))

model = LSTMClassifier(FEAT, 128, 2, 2, dropout=0.3,
                       bidirectional=True).to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

for epoch in range(30):
    model.train()
    for i in range(0, len(X_train), BATCH):
        xb = X_train[i:i + BATCH].to(device)
        yb = y_train[i:i + BATCH].to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_acc = (model(X_val.to(device)).argmax(1)
                   == y_val.to(device)).float().mean()
    if epoch % 5 == 0:
        print(f"Epoch {epoch:2d} | val acc: {val_acc:.3f}")
nn.utils.rnn.pack_padded_sequence when sequences have variable lengths. Prevents the model from attending to padding tokens.
The Architecture That
Powers Modern AI
Every frontier AI system today — GPT-4, Claude, Gemini, DALL-E, Whisper — is built on the Transformer. Understanding attention is now a foundational skill, not an advanced one.
| Property | Encoder (BERT-style) | Decoder (GPT-style) |
|---|---|---|
| Attention direction | Bidirectional (all tokens see all) | Causal (only past tokens) |
| Training objective | Masked Language Modeling | Next token prediction |
| Use cases | Classification, NER, embeddings | Text generation, LLMs |
| Examples | BERT, RoBERTa, DeBERTa | GPT-2/3/4, LLaMA, Claude |
| Inference | One forward pass | Autoregressive (token by token) |
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

device = 'cuda' if torch.cuda.is_available() else 'cpu'


# ── SCALED DOT-PRODUCT ATTENTION ──────────────────────────────────────
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a fused QKV projection."""

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0
        self.h = n_heads
        self.d_k = d_model // n_heads
        # One projection matrix for Q, K, V combined (faster)
        self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)
        self.proj = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        B, T, C = x.shape
        # Project and split into heads
        qkv = self.qkv(x).reshape(B, T, 3, self.h, self.d_k)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, B, h, T, d_k)
        Q, K, V = qkv[0], qkv[1], qkv[2]
        # Scaled dot-product attention
        scale = math.sqrt(self.d_k)
        scores = (Q @ K.transpose(-2, -1)) / scale  # (B, h, T, T)
        if mask is not None:
            # mask==0 positions get -inf so softmax assigns them zero weight
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn = self.dropout(F.softmax(scores, dim=-1))
        out = (attn @ V).transpose(1, 2).contiguous().reshape(B, T, C)
        return self.proj(out)


# ── TRANSFORMER BLOCK (Pre-LN — modern & stable) ──────────────────────
class TransformerBlock(nn.Module):
    """Pre-LayerNorm block: x + attn(LN(x)), then x + FFN(LN(x))."""

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.ln2 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),  # GELU outperforms ReLU in Transformers
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x, mask=None):
        x = x + self.attn(self.ln1(x), mask)  # pre-LN residual
        x = x + self.ff(self.ln2(x))
        return x


# ── GPT-STYLE LANGUAGE MODEL ──────────────────────────────────────────
class GPT(nn.Module):
    """Decoder-only Transformer with learned positional embeddings."""

    def __init__(self, vocab_size, d_model, n_heads, n_layers, d_ff,
                 max_seq_len, dropout=0.1):
        super().__init__()
        self.max_seq_len = max_seq_len  # context window; generate() crops to it
        self.tok_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_seq_len, d_model)  # learned
        self.drop = nn.Dropout(dropout)
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, n_heads, d_ff, dropout)
            for _ in range(n_layers)
        ])
        self.ln_final = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size, bias=False)
        # Weight tying — share embedding and output weights (GPT-2 trick)
        self.head.weight = self.tok_embed.weight
        self._init_weights()

    def _init_weights(self):
        # N(0, 0.02) everywhere; the tied embed/head tensor is simply
        # initialized twice, which is harmless.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, mean=0, std=0.02)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, mean=0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        pos = torch.arange(T, device=idx.device)
        x = self.drop(self.tok_embed(idx) + self.pos_embed(pos))
        # Causal (autoregressive) mask — upper triangle = 0
        mask = torch.tril(torch.ones(T, T, device=idx.device)).unsqueeze(0).unsqueeze(0)
        for block in self.blocks:
            x = block(x, mask)
        logits = self.head(self.ln_final(x))
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
                                   targets.view(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Sample max_new_tokens autoregressively, appending to idx."""
        for _ in range(max_new_tokens):
            # FIX: crop to the configured context window instead of a
            # hardcoded 512 — works for any max_seq_len.
            logits, _ = self(idx[:, -self.max_seq_len:])
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = float('-inf')
            probs = F.softmax(logits, dim=-1)
            next_tok = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_tok], dim=1)
        return idx


# ── INSTANTIATE A SMALL GPT ────────────────────────────────────────────
model = GPT(
    vocab_size=50257,  # GPT-2 vocabulary
    d_model=256,
    n_heads=8,
    n_layers=6,
    d_ff=1024,
    max_seq_len=512,
    dropout=0.1,
).to(device)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Training uses exactly the same loop as PyTorch section above
optimizer = torch.optim.AdamW(
    model.parameters(), lr=3e-4, betas=(0.9, 0.95), weight_decay=0.1)

# ── USING HUGGINGFACE INSTEAD (production choice) ─────────────────────
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Classify with BERT-style encoder (much easier than from scratch)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
clf_model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=2)

texts = ["This movie is amazing!", "I hated every minute of it."]
tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():
    logits = clf_model(**tokens).logits
preds = logits.argmax(dim=-1)
# FIX: the original passed a bare generator to print(), which printed
# "<generator object ...>" — materialize the label list instead.
print("Predictions:", [["negative", "positive"][p] for p in preds.tolist()])
For fast, fused attention, use F.scaled_dot_product_attention (PyTorch 2.0+).
The Craft of Getting
Models to Actually Train
Knowing the architecture is only half the battle. This is the practitioner knowledge that separates people who get results from people who get NaN loss and give up. Regularization, optimizers, debugging — all of it here.
| Optimizer | Best For | Key Params | Notes |
|---|---|---|---|
| SGD + Momentum | CNNs, ResNets with careful tuning | lr=0.1, momentum=0.9 | Best final accuracy with proper schedule; hard to tune |
| Adam | Transformers, MLPs, general use | lr=3e-4, β=(0.9, 0.999) | Adaptive. Flawed weight decay — use AdamW instead |
| AdamW | Transformers, LLMs, default choice | lr=3e-4, wd=0.01-0.1 | Decoupled weight decay. The modern standard |
| Lion | Large model fine-tuning | lr=3e-5, wd=1.0 | Uses only sign of gradient. Memory efficient |
| Muon | Language model pretraining | — | Orthogonalization-based. Recent SOTA for LM training |
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
import numpy as np
import math

device = 'cuda' if torch.cuda.is_available() else 'cpu'


# ── 1. REGULARIZATION TECHNIQUES ──────────────────────────────────────
class RegularizedMLP(nn.Module):
    """MLP demonstrating the usual regularizers: norm layers + dropout."""

    def __init__(self, in_dim, hidden, out_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.BatchNorm1d(hidden),      # BN before activation
            nn.GELU(),
            nn.Dropout(0.3),             # dropout after activation
            nn.Linear(hidden, hidden // 2),
            nn.LayerNorm(hidden // 2),   # or use LayerNorm
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(hidden // 2, out_dim),
        )

    def forward(self, x):
        return self.net(x)


# ── 2. LEARNING RATE WARMUP + COSINE DECAY ────────────────────────────
def get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps,
                                    min_lr_ratio=0.1):
    """LambdaLR: linear warmup to the base LR, then cosine decay with a floor."""
    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(1, warmup_steps)  # linear warmup
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        cosine = 0.5 * (1 + math.cos(math.pi * progress))
        return max(min_lr_ratio, cosine)        # cosine decay
    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


model = RegularizedMLP(20, 128, 2).to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.05,
                        betas=(0.9, 0.999), eps=1e-8)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, warmup_steps=100, total_steps=1000)

# ── 3. GRADIENT ACCUMULATION (simulate large batch on small GPU) ───────
ACCUM_STEPS = 4  # effective batch = batch_size × ACCUM_STEPS
criterion = nn.CrossEntropyLoss()

model.train()
optimizer.zero_grad()
for step, (xb, yb) in enumerate(train_loader):  # assume train_loader exists
    xb, yb = xb.to(device), yb.to(device)
    loss = criterion(model(xb), yb) / ACCUM_STEPS  # scale loss
    loss.backward()
    if (step + 1) % ACCUM_STEPS == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

# ── 4. MIXED PRECISION (AMP) ──────────────────────────────────────────
scaler = GradScaler()
for xb, yb in train_loader:
    xb, yb = xb.to(device), yb.to(device)
    optimizer.zero_grad()
    with autocast(dtype=torch.bfloat16):  # bfloat16 on Ampere+
        pred = model(xb)
        loss = criterion(pred, yb)
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)  # unscale before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)
    scaler.update()


# ── 5. LEARNING RATE FINDER ────────────────────────────────────────────
def lr_finder(model, optimizer, criterion, train_loader,
              start_lr=1e-7, end_lr=10, num_iter=100):
    """Sweep LR geometrically from start_lr to end_lr, recording the loss.

    Plot the result; the steepest-descent region marks a good LR.
    """
    lrs, losses = [], []
    mult = (end_lr / start_lr) ** (1 / num_iter)
    lr = start_lr
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    model.train()
    for i, (xb, yb) in enumerate(train_loader):
        if i >= num_iter:
            break
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
        lrs.append(lr); losses.append(loss.item())
        lr *= mult
        for pg in optimizer.param_groups:
            pg['lr'] = lr
        if loss.item() > 4 * min(losses):
            break  # stop if diverging
    return lrs, losses
# Plot: steepest descent region → your optimal LR


# ── 6. DEBUGGING TOOLKIT ──────────────────────────────────────────────
def diagnose_model(model, x_sample, y_sample, criterion):
    """Quick sanity checks before training."""
    model.eval()
    with torch.no_grad():
        out = model(x_sample.to(device))
    # 1. Check output shape
    print(f"Output shape: {out.shape}")
    # 2. Check initial loss matches random baseline
    loss = criterion(out, y_sample.to(device))
    n_classes = out.shape[-1]
    expected = -math.log(1.0 / n_classes)
    print(f"Initial loss: {loss:.3f} (expected ~{expected:.3f} for {n_classes} classes)")
    # 3. Check gradient flow
    model.train()
    out = model(x_sample.to(device))
    loss = criterion(out, y_sample.to(device))
    loss.backward()
    grads = [(n, p.grad.abs().mean().item())
             for n, p in model.named_parameters() if p.grad is not None]
    print("Gradient norms:")
    for name, gnorm in grads[-5:]:
        print(f" {name:40s} {gnorm:.6f}")
    # 4. Overfit one batch
    print("\nOverfitting 1 batch:")
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    for i in range(100):
        optimizer.zero_grad()
        loss = criterion(model(x_sample.to(device)), y_sample.to(device))
        loss.backward()
        optimizer.step()
        if i % 20 == 0:
            print(f" step {i}: {loss.item():.4f}")
    # Should reach near-zero. If not: model, loss, or loop has a bug.


# ── 7. COMMON NaN DEBUGGING ───────────────────────────────────────────
def add_nan_hooks(model):
    """Register hooks that print which layer produced NaN."""
    def hook(module, inp, out, name=''):
        if isinstance(out, torch.Tensor) and torch.isnan(out).any():
            print(f"NaN detected in {name} output!")
    for name, layer in model.named_modules():
        # n=name binds the current name (late-binding closure fix)
        layer.register_forward_hook(
            lambda m, i, o, n=name: hook(m, i, o, n))


# ── 8. EMA — Exponential Moving Average (improves eval performance) ────
class EMA:
    """Shadow copy of model weights updated as an exponential moving average."""

    def __init__(self, model, decay=0.9999):
        self.model = model
        self.decay = decay
        self.shadow = {k: v.clone().detach()
                       for k, v in model.state_dict().items()}

    def update(self):
        with torch.no_grad():
            for k, v in self.model.state_dict().items():
                self.shadow[k] = self.decay * self.shadow[k] + (1 - self.decay) * v

    def apply_shadow(self, model):
        model.load_state_dict(self.shadow)


ema = EMA(model, decay=0.9999)
# Call ema.update() after each optimizer.step() during training
# Use ema.apply_shadow(eval_model) for evaluation
Loss explodes after N steps: Add gradient clipping. Lower LR. Increase weight decay.
Loss doesn't decrease at all: Check LR (too small?), data normalization, gradient flow.
Train low, val high (overfit): Add dropout, weight decay, data augmentation, reduce model size.
Both high (underfit): Larger model, more training steps, lower weight decay, higher LR.
Loss oscillates wildly: Reduce LR. Add LR warmup. Check for bad batches in dataset.