Training Loop & Backpropagation
Complete Module: Gradient Descent, Computation Graph, Train on TinyShakespeare
Module Objective
Master the full training loop — forward pass, loss, backward pass, optimizer step, computation graph, gradient flow, and train a Mini-GPT on TinyShakespeare.
1. The Training Loop: 4 Steps
for x, y in data:
    logits = model(x)              # 1. Forward
    loss = criterion(logits, y)    # 2. Loss
    loss.backward()                # 3. Backward: compute gradients
    optimizer.step()               # 4. Step: update weights
    optimizer.zero_grad()          # Clear gradients for the next iteration
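Here is a minimal, runnable sketch of those four steps on a toy problem (the synthetic data and the single linear layer are assumptions, chosen only to make the loop self-contained):

```python
import torch
import torch.nn as nn

# Toy data: y ≈ 3x plus noise (synthetic, only to exercise the loop)
x = torch.randn(256, 1)
y = 3 * x + 0.1 * torch.randn(256, 1)

model = nn.Linear(1, 1)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for step in range(200):
    logits = model(x)              # 1. Forward
    loss = criterion(logits, y)    # 2. Loss
    loss.backward()                # 3. Backward: compute gradients
    optimizer.step()               # 4. Step: update weights
    optimizer.zero_grad()          # Clear gradients for the next iteration

print(model.weight.item())         # converges toward ~3.0
```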
2. Computation Graph: Autograd Engine
x = torch.tensor([2.0], requires_grad=True)
y = x ** 2 + 3 * x + 1
y.backward()
print(x.grad) # tensor([7.]) → dy/dx = 2x + 3
PyTorch builds a dynamic computation graph during the forward pass
Every call to .backward() traverses that graph in reverse to compute ∂L/∂w for each parameter
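To see the graph being traversed through an intermediate node, here is a small sketch (the variable names and expressions are arbitrary):

```python
import torch

x = torch.tensor([2.0], requires_grad=True)
u = x ** 2          # intermediate node in the graph
y = 3 * u + x       # y = 3x^2 + x

y.backward()        # walks the graph y -> u -> x, applying the chain rule
print(x.grad)       # tensor([13.])  since dy/dx = 6x + 1 = 13 at x = 2
```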
3. Gradient Descent: The Math
$$
w_{t+1} = w_t - \eta \cdot \nabla_w L
$$
- $ \eta $: learning rate
- $ \nabla_w L $: the gradient computed by .backward()
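Written out by hand, a single gradient-descent update looks like this (a minimal sketch; the quadratic toy loss is an assumption for illustration):

```python
import torch

w = torch.tensor(5.0, requires_grad=True)
eta = 0.1                       # learning rate

loss = (w - 2.0) ** 2           # toy loss with its minimum at w = 2
loss.backward()                 # w.grad = dL/dw = 2 * (w - 2) = 6

with torch.no_grad():           # the update itself is not part of the graph
    w -= eta * w.grad           # w_{t+1} = w_t - eta * grad  ->  5 - 0.1 * 6 = 4.4
w.grad.zero_()                  # clear the gradient before the next step

print(w)                        # tensor(4.4000, requires_grad=True)
```

optimizer.step() does exactly this update (plus momentum and adaptive terms, depending on the optimizer) for every parameter at once.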
4. Full Training Setup: TinyShakespeare
import torch

# === 1. Download Dataset ===
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O tiny.txt
text = open('tiny.txt', 'r').read()
# === 2. Build Vocabulary ===
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]
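A quick sanity check on the vocabulary and the round trip, continuing from the setup above (the exact token ids and counts depend on the downloaded file):

```python
print(f"dataset length: {len(text):,} characters")   # ~1.1M characters
print(f"vocab size: {vocab_size}")                    # 65 unique characters for this file
print(encode("hello"))                                # e.g. [46, 43, 50, 50, 53]
print(decode(encode("hello")))                        # 'hello' (the mapping is lossless)
```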
5. Mini-GPT Model (64-dim)
import torch
import torch.nn as nn
import torch.nn.functional as F

class MiniGPT(nn.Module):
    def __init__(self, vocab_size, n_embd=64, n_head=4, n_layer=4, block_size=128):
        super().__init__()
        self.block_size = block_size
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.ModuleList([TransformerBlock(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, (nn.Linear, nn.Embedding)):
            nn.init.normal_(m.weight, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        x = self.token_emb(idx) + self.pos_emb(torch.arange(T, device=idx.device))
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
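The cross_entropy call above flattens the batch and time dimensions before comparing logits to targets. A standalone sketch of that reshape (toy shapes; the 0.02 scale mimics the weight initialization above):

```python
import torch
import torch.nn.functional as F

B, T, V = 2, 5, 65                          # batch, time, vocab (toy sizes)
logits = 0.02 * torch.randn(B, T, V)        # near-zero logits, as at initialization
targets = torch.randint(0, V, (B, T))       # next-token indices

# F.cross_entropy expects (N, classes) logits and (N,) class indices
loss = F.cross_entropy(logits.view(-1, V), targets.view(-1))
print(loss.item())                          # close to ln(65) ≈ 4.17, the "uniform guess" loss
```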
6. Transformer Block (Causal)
class TransformerBlock(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head)
        self.ln2 = nn.LayerNorm(n_embd)
        self.ff = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))   # pre-norm residual attention
        x = x + self.ff(self.ln2(x))     # pre-norm residual feed-forward
        return x

class CausalSelfAttention(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.n_head = n_head
        self.d_k = n_embd // n_head
        self.Wq = nn.Linear(n_embd, n_embd)
        self.Wk = nn.Linear(n_embd, n_embd)
        self.Wv = nn.Linear(n_embd, n_embd)
        self.Wo = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        B, T, C = x.shape
        q = self.Wq(x).view(B, T, self.n_head, self.d_k).transpose(1, 2)
        k = self.Wk(x).view(B, T, self.n_head, self.d_k).transpose(1, 2)
        v = self.Wv(x).view(B, T, self.n_head, self.d_k).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / (self.d_k ** 0.5))   # scaled dot-product scores
        mask = torch.tril(torch.ones(T, T, device=x.device))
        att = att.masked_fill(mask == 0, float('-inf'))               # block attention to future tokens
        att = F.softmax(att, dim=-1)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)              # re-merge the heads
        return self.Wo(y)
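Before training it helps to know how big the model is; a throwaway instance is enough to count parameters (a sketch; `m` is a hypothetical name, and the ≈0.22M figure assumes the 64-dim, 4-layer defaults with a 65-character vocabulary):

```python
m = MiniGPT(vocab_size)                         # defaults: n_embd=64, n_head=4, n_layer=4
n_params = sum(p.numel() for p in m.parameters())
print(f"{n_params/1e6:.2f}M parameters")        # ≈ 0.22M with these defaults
```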
7. Data Loader: Get Batch
block_size = 128
batch_size = 32
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))   # random starting offsets
    x = torch.stack([data[i:i+block_size] for i in ix])         # inputs
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])     # targets: inputs shifted by one
    return x, y
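A quick shape check, and a confirmation that the targets are the inputs shifted by one position (continuing from get_batch above):

```python
xb, yb = get_batch('train')
print(xb.shape, yb.shape)      # torch.Size([32, 128]) torch.Size([32, 128])
print(xb[0, 1:9].tolist())     # tokens 1..8 of the first sample
print(yb[0, 0:8].tolist())     # identical to the line above: yb[b, t] == xb[b, t+1]
```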
8. Full Training Loop
model = MiniGPT(vocab_size, block_size=block_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
steps = 5000

for step in range(steps):
    # === 1. Get batch ===
    xb, yb = get_batch('train')
    # === 2. Forward ===
    logits, loss = model(xb, yb)
    # === 3. Backward ===
    optimizer.zero_grad()
    loss.backward()
    # === 4. Step ===
    optimizer.step()

    if step % 500 == 0:
        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for _ in range(10):
                x_val, y_val = get_batch('val')
                _, loss_val = model(x_val, y_val)
                val_loss += loss_val.item()
        model.train()
        print(f"Step {step} | Train Loss: {loss.item():.4f} | Val Loss: {val_loss/10:.4f}")
9. Generate Text
@torch.no_grad()
def generate(model, idx, max_new_tokens=200):
    model.eval()
    for _ in range(max_new_tokens):
        idx_cond = idx if idx.size(1) <= block_size else idx[:, -block_size:]   # crop to the context window
        logits, _ = model(idx_cond)
        logits = logits[:, -1, :]                        # only the last position predicts the next token
        probs = F.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, idx_next], dim=1)
    model.train()
    return idx
context = torch.tensor(encode("ROMEO:"), dtype=torch.long).unsqueeze(0)
generated = generate(model, context)
print(decode(generated[0].tolist()))
Sample output (yours will differ, since sampling is stochastic):
ROMEO: I will not be a good man...
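The sampling above draws directly from the softmax. A common variant adds a temperature and a top-k filter; this sketch is not part of the model above, and the parameter values (0.8, 20) are illustrative only:

```python
@torch.no_grad()
def generate_topk(model, idx, max_new_tokens=200, temperature=0.8, top_k=20):
    model.eval()
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -block_size:]                  # crop to the context window
        logits, _ = model(idx_cond)
        logits = logits[:, -1, :] / temperature          # temperature < 1 sharpens the distribution
        v, _ = torch.topk(logits, top_k)
        logits[logits < v[:, [-1]]] = float('-inf')      # drop everything outside the top k
        probs = F.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, idx_next], dim=1)
    model.train()
    return idx
```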
10. Visualize Computation Graph
from torchviz import make_dot
x = torch.randint(0, vocab_size, (1, 8))   # a tiny batch of token ids (the model expects indices, not embeddings)
y = torch.randint(0, vocab_size, (1, 8))
logits, loss = model(x, y)
graph = make_dot(loss, params=dict(model.named_parameters()))
graph.render("computation_graph", format="png")
This renders the full backward path from the loss back to the embeddings
11. Gradient Flow: Check Health
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad mean={param.grad.abs().mean():.6f}, std={param.grad.std():.6f}")
Healthy gradients are non-zero, free of NaNs, and not growing from step to step
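A single number that summarizes gradient health is the global gradient norm, the same quantity that gradient clipping bounds (a sketch; the thresholds in the comments are rules of thumb, not hard limits):

```python
grad_norms = [p.grad.norm() for p in model.parameters() if p.grad is not None]
total_norm = torch.stack(grad_norms).norm()
print(f"global grad norm: {total_norm.item():.4f}")
# rough heuristics: values near zero (e.g. < 1e-7) suggest vanishing gradients,
# values that keep growing step after step suggest exploding gradients
```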
12. Learning Rate Scheduling
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=steps)
# In loop:
scheduler.step()
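scheduler.step() is called once per iteration, right after optimizer.step(). A sketch of where it sits in the loop from section 8 (the optimizer is re-created here only so T_max matches steps):

```python
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=steps)

for step in range(steps):
    xb, yb = get_batch('train')
    _, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()                     # decay the learning rate after the weight update
    # scheduler.get_last_lr()[0] gives the current learning rate, if you want to log it
```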
13. Summary Table
| Step | Code | Purpose |
|---|---|---|
| Forward | logits, loss = model(x, y) | Compute predictions |
| Loss | cross_entropy | Measure error |
| Backward | loss.backward() | Compute ∇ |
| Step | optimizer.step() | Update weights |
| Zero Grad | optimizer.zero_grad() | Clear old ∇ |
14. Practice Exercises
- Add gradient clipping with torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) (see the sketch after this list)
- Plot the loss curve
- Add early stopping
- Try SGD vs. Adam
- Inspect attention maps during training
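For the first exercise, clipping goes between loss.backward() and optimizer.step(). A sketch of the modified inner loop (max_norm=1.0 is a common default, not a tuned value):

```python
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # rescale gradients if their global norm exceeds 1.0
optimizer.step()
```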
15. Key Takeaways
| ✓ | Insight |
|---|---|
| ✓ | Training loop = 4 lines |
| ✓ | Autograd builds the graph automatically |
| ✓ | .backward() = chain rule |
| ✓ | TinyShakespeare = real language |
| ✓ | You just trained a GPT |
Full Copy-Paste Training Script
# === FULL TRAINING SCRIPT ===
import torch
import torch.nn as nn
import torch.nn.functional as F
# [MiniGPT, TransformerBlock, CausalSelfAttention, and generate() defined as in the sections above]
# Data
!wget -q https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O tiny.txt
text = open('tiny.txt').read()
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)
data = torch.tensor(encode(text), dtype=torch.long)
train_data, val_data = data[:int(0.9*len(data))], data[int(0.9*len(data)):]
block_size = 128
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (32,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y
# Model & Training
model = MiniGPT(vocab_size, block_size=block_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
for step in range(5000):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if step % 1000 == 0:
        print(f"Step {step}, Loss: {loss.item():.4f}")
# Generate
context = torch.tensor(encode("ROMEO:"), dtype=torch.long).unsqueeze(0)
print(decode(generate(model, context, 200)[0].tolist()))  # uses the standalone generate() from section 9
Final Words
You just trained a language model from scratch.
- Forward → Loss → Backward → Step
- Autograd handles the math
- Trained on TinyShakespeare, the model picks up Shakespeare's style
End of Module
You now control the full training pipeline.
Next: Pretrain on 1B tokens.