Training Loop & Backpropagation

Complete Module: Gradient Descent, Computation Graph, Train on TinyShakespeare


Module Objective

Master the full training loop (forward pass, loss, backward pass, optimizer step), understand the computation graph and gradient flow, and train a Mini-GPT on TinyShakespeare.


1. The Training Loop: 4 Steps

for batch in data:
    1. Forward:  logits = model(x)
    2. Loss:     loss = criterion(logits, y)
    3. Backward: loss.backward()          # Compute gradients
    4. Step:     optimizer.step()         # Update weights
                 optimizer.zero_grad()    # Clear gradients for the next iteration
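
To make the pattern concrete before scaling up to the Mini-GPT, here is a minimal self-contained version of the same four steps on a toy regression problem (an illustrative sketch, not part of the module's model):

import torch
import torch.nn as nn

# Toy data: y = 2x + 1 plus a little noise
x = torch.randn(256, 1)
y = 2 * x + 1 + 0.01 * torch.randn(256, 1)

model = nn.Linear(1, 1)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for step in range(200):
    pred = model(x)              # 1. Forward
    loss = criterion(pred, y)    # 2. Loss
    optimizer.zero_grad()        # clear old gradients
    loss.backward()              # 3. Backward: compute gradients
    optimizer.step()             # 4. Step: update weights

print(model.weight.item(), model.bias.item())  # should approach 2.0 and 1.0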

2. Computation Graph: Autograd Engine

import torch

x = torch.tensor([2.0], requires_grad=True)
y = x ** 2 + 3 * x + 1
y.backward()
print(x.grad)  # tensor([7.]) → dy/dx = 2x + 3

PyTorch builds a dynamic computation graph as the operations execute.
Each call to .backward() traverses that graph in reverse (the chain rule) to compute ∂L/∂w for every tensor with requires_grad=True.


3. Gradient Descent: The Math

$$
w_{t+1} = w_t - \eta \cdot \nabla_w L
$$

  • $ \eta $: learning rate
  • $ \nabla_w L $: gradient from .backward()
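
The optimizer does nothing more exotic than this update. A hand-rolled sketch with a toy loss (illustrative only) makes the correspondence explicit:

import torch

w = torch.tensor([0.0], requires_grad=True)   # a single parameter
eta = 0.1                                     # learning rate

for _ in range(50):
    loss = (w - 3.0) ** 2      # toy loss, minimized at w = 3
    loss.backward()            # fills w.grad with dL/dw
    with torch.no_grad():
        w -= eta * w.grad      # w_{t+1} = w_t - eta * grad
    w.grad.zero_()             # clear the gradient before the next step

print(w.item())  # ≈ 3.0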

4. Full Training Setup: TinyShakespeare

# === 1. Download Dataset ===
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O tiny.txt
text = open('tiny.txt', 'r').read()

# === 2. Build Vocabulary ===
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]
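
A quick sanity check of the character-level tokenizer (assumes the cells above have run; the exact ids depend on the corpus):

print(vocab_size)               # 65 distinct characters for TinyShakespeare
print(encode("hello"))          # e.g. [46, 43, 50, 50, 53]
print(decode(encode("hello")))  # 'hello'  (the round trip is lossless)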

5. Mini-GPT Model (64-dim)

import torch
import torch.nn as nn
import torch.nn.functional as F

class MiniGPT(nn.Module):
    def __init__(self, vocab_size, n_embd=64, n_head=4, n_layer=4, block_size=128):
        super().__init__()
        self.block_size = block_size
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.ModuleList([TransformerBlock(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, (nn.Linear, nn.Embedding)):
            nn.init.normal_(m.weight, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        x = self.token_emb(idx) + self.pos_emb(torch.arange(T, device=idx.device))
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

6. Transformer Block (Causal)

class TransformerBlock(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head)
        self.ln2 = nn.LayerNorm(n_embd)
        self.ff = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd)
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x

class CausalSelfAttention(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.n_head = n_head
        self.d_k = n_embd // n_head
        self.Wq = nn.Linear(n_embd, n_embd)
        self.Wk = nn.Linear(n_embd, n_embd)
        self.Wv = nn.Linear(n_embd, n_embd)
        self.Wo = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        B, T, C = x.shape
        q = self.Wq(x).view(B, T, self.n_head, self.d_k).transpose(1, 2)
        k = self.Wk(x).view(B, T, self.n_head, self.d_k).transpose(1, 2)
        v = self.Wv(x).view(B, T, self.n_head, self.d_k).transpose(1, 2)

        att = (q @ k.transpose(-2, -1)) * (1.0 / (self.d_k ** 0.5))
        mask = torch.tril(torch.ones(T, T, device=x.device))
        att = att.masked_fill(mask == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.Wo(y)
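
Before wiring up training, a quick shape check confirms the pieces fit together (a throwaway sketch that uses random token ids, not real data):

# Smoke test: random token ids in, (B, T, vocab_size) logits out
model = MiniGPT(vocab_size)
idx = torch.randint(0, vocab_size, (2, 16))       # B=2, T=16
targets = torch.randint(0, vocab_size, (2, 16))
logits, loss = model(idx, targets)
print(logits.shape)   # torch.Size([2, 16, vocab_size])
print(loss.item())    # roughly ln(vocab_size) ≈ 4.17 at initialization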

7. Data Loader: Get Batch

block_size = 128
batch_size = 32

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y
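
The targets are simply the inputs shifted one position to the right: at every index the model learns to predict the next character. Inspecting one batch makes that explicit (illustrative):

xb, yb = get_batch('train')
print(xb.shape, yb.shape)            # torch.Size([32, 128]) each
print(decode(xb[0, :40].tolist()))   # a random 40-character slice of the corpus
print(decode(yb[0, :40].tolist()))   # the same slice shifted one character ahead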

8. Full Training Loop

model = MiniGPT(vocab_size, block_size=block_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
steps = 5000

for step in range(steps):
    # === 1. Get batch ===
    xb, yb = get_batch('train')

    # === 2. Forward ===
    logits, loss = model(xb, yb)

    # === 3. Backward ===
    optimizer.zero_grad()
    loss.backward()

    # === 4. Step ===
    optimizer.step()

    if step % 500 == 0:
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for _ in range(10):
                x_val, y_val = get_batch('val')
                _, loss_val = model(x_val, y_val)
                val_loss += loss_val.item()
        model.train()
        print(f"Step {step} | Train Loss: {loss.item():.4f} | Val Loss: {val_loss/10:.4f}")

9. Generate Text

@torch.no_grad()
def generate(model, idx, max_new_tokens=200):
    model.eval()
    for _ in range(max_new_tokens):
        idx_cond = idx if idx.size(1) <= block_size else idx[:, -block_size:]
        logits, _ = model(idx_cond)
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, idx_next], dim=1)
    model.train()
    return idx

context = torch.tensor(encode("ROMEO:"), dtype=torch.long).unsqueeze(0)
generated = generate(model, context)
print(decode(generated[0].tolist()))

Example output after some training (sampled, so yours will differ):
ROMEO: I will not be a good man...


10. Visualize Computation Graph

from torchviz import make_dot

x = torch.randint(0, vocab_size, (1, 8))   # token ids, not embeddings
y = torch.randint(0, vocab_size, (1, 8))
logits, loss = model(x, y)
graph = make_dot(loss, params=dict(model.named_parameters()))
graph.render("computation_graph", format="png")

Shows full backward path from loss → embeddings
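
If torchviz isn't installed, you can still peek at the graph by walking grad_fn by hand (a rough sketch, no extra dependencies):

# Print the first few backward nodes reachable from the loss
node = loss.grad_fn
for _ in range(6):
    if node is None:
        break
    print(type(node).__name__)   # e.g. NllLossBackward0, LogSoftmaxBackward0, ...
    parents = [fn for fn, _ in node.next_functions if fn is not None]
    node = parents[0] if parents else None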


11. Gradient Flow: Check Health

for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad mean={param.grad.abs().mean():.6f}, std={param.grad.std():.6f}")

Run this right after loss.backward() (before zero_grad clears the gradients). Healthy gradients are non-zero and of moderate size; all-zero values suggest a broken graph, while very large values call for clipping or a lower learning rate.
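
A single global number is often easier to watch over time than per-parameter stats. One option (a sketch, also run right after loss.backward()):

# Global L2 norm over all parameter gradients
grad_norms = [p.grad.detach().norm() for p in model.parameters() if p.grad is not None]
total_norm = torch.stack(grad_norms).norm().item()
print(f"global grad norm: {total_norm:.4f}")   # sudden spikes usually call for gradient clipping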


12. Learning Rate Scheduling

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=steps)
# In the training loop (see the sketch below), call once per step, right after optimizer.step():
scheduler.step()
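
A sketch of where the scheduler sits in the loop from section 8; the learning rate decays from 3e-4 towards zero over the run, and get_last_lr() lets you log it:

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=steps)

for step in range(steps):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()          # one scheduler step per optimizer step
    if step % 500 == 0:
        print(f"step {step} | lr {scheduler.get_last_lr()[0]:.2e} | loss {loss.item():.4f}")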

13. Summary Table

Step        Code                          Purpose
---------   ---------------------------   -------------------
Forward     logits, loss = model(x, y)    Compute predictions
Loss        F.cross_entropy               Measure the error
Backward    loss.backward()               Compute gradients
Step        optimizer.step()              Update the weights
Zero grad   optimizer.zero_grad()         Clear old gradients

14. Practice Exercises

  1. Add gradient clipping between loss.backward() and optimizer.step(): torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
  2. Plot the loss curve (a starter sketch follows this list)
  3. Add early stopping
  4. Try SGD vs Adam
  5. Inspect attention maps during training
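
A starter for exercise 2, assuming matplotlib is available: collect loss.item() every step, then plot it.

import matplotlib.pyplot as plt

losses = []
for step in range(steps):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.item())   # record the training loss each step

plt.plot(losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("Mini-GPT on TinyShakespeare")
plt.savefig("loss_curve.png")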

15. Key Takeaways

  • The training loop is four steps: forward, loss, backward, step
  • Autograd builds the computation graph automatically
  • .backward() is the chain rule applied over that graph
  • TinyShakespeare is real language data
  • You just trained a miniature GPT

Full Copy-Paste Training Script

# === FULL TRAINING SCRIPT ===
import torch
import torch.nn as nn
import torch.nn.functional as F

# [Paste the MiniGPT, TransformerBlock, CausalSelfAttention classes and the generate() function from above here]

# Data
!wget -q https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O tiny.txt
text = open('tiny.txt').read()
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)
data = torch.tensor(encode(text), dtype=torch.long)
train_data, val_data = data[:int(0.9*len(data))], data[int(0.9*len(data)):]

block_size = 128
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (32,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

# Model & Training
model = MiniGPT(vocab_size, block_size=block_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

for step in range(5000):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 1000 == 0:
        print(f"Step {step}, Loss: {loss.item():.4f}")

# Generate
context = torch.tensor(encode("ROMEO:"), dtype=torch.long).unsqueeze(0)
print(decode(generate(model, context, max_new_tokens=200)[0].tolist()))

Final Words

You just trained a language model from scratch.
- Forward → Loss → Backward → Step
- Autograd handles the math
- A character-level Mini-GPT learns Shakespeare-like text from the TinyShakespeare corpus


End of Module
You now control the full training pipeline.
Next: Pretrain on 1B tokens.

Last updated: Nov 13, 2025
