Section 06

The Code: Implementing RMSNorm and Running LLaMA

LLaMA: Open and Efficient Foundation Language Models 2023

Below are complete Python implementations that run on Google Colab. We’ll implement RMSNorm from scratch and show how to load and run LLaMA.


Code 1: Implementing RMSNorm from Scratch

import torch
import torch.nn as nn
import math

# Implement RMSNorm (Root Mean Square Normalization)
class RMSNorm(nn.Module):
    """Root Mean Square normalization over the last dimension.

    Each feature vector is divided by its root mean square and then scaled by
    a learnable per-feature gain. Unlike LayerNorm there is no mean
    subtraction and no bias term.

    Args:
        dim: Size of the last (feature) dimension of the input.
        eps: Constant added to the mean square before the square root so an
            all-zero vector does not divide by zero.
    """

    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.eps = eps  # Small constant for numerical stability
        self.gamma = nn.Parameter(torch.ones(dim))  # Learnable scale

    def forward(self, x):
        """Return ``x / sqrt(mean(x^2) + eps) * gamma``.

        Args:
            x: Floating-point tensor whose last dimension has size ``dim``.

        Returns:
            Tensor of the same shape as ``x``; its dtype is the promotion of
            ``x.dtype`` and ``gamma.dtype``.
        """
        # In half precision, eps=1e-8 rounds to zero and x**2 can overflow, so
        # an all-zero row would produce 0/0 = NaN. Compute the statistics in
        # float32 for those dtypes; wider dtypes are left untouched.
        if x.dtype in (torch.float16, torch.bfloat16):
            work = x.float()
        else:
            work = x
        # Compute RMS: sqrt(mean(x^2))
        rms = torch.sqrt(torch.mean(work**2, dim=-1, keepdim=True) + self.eps)
        # Normalize, return to the input dtype, then apply the learnable scale
        normalized = (work / rms).type_as(x)
        return normalized * self.gamma

# Side-by-side demo: push the same 4-feature row through both normalizers.
x = torch.tensor([[2.0, -1.0, 3.0, 0.0]])
rms_norm = RMSNorm(4)
layer_norm = nn.LayerNorm(4)

print("Input:", x)
print("RMSNorm output:", rms_norm(x))
print("LayerNorm output:", layer_norm(x))
print("\nKey difference: RMSNorm is simpler (no mean subtraction)")

# Hand-computed RMS (without eps) as a sanity check on the value used above.
rms_manual = x.pow(2).mean(dim=-1, keepdim=True).sqrt()
print(f"Manual RMS: {rms_manual.item():.4f}")
print("Expected: sqrt((4+1+9+0)/4) = sqrt(3.5) = 1.8708")

Expected Output:

Input: tensor([[ 2., -1.,  3.,  0.]])
RMSNorm output: tensor([[ 1.0690, -0.5345,  1.6036,  0.0000]], grad_fn=<MulBackward0>)
LayerNorm output: tensor([[ 0.6325, -1.2649,  1.2649, -0.6325]], grad_fn=<NativeLayerNormBackward0>)

Key difference: RMSNorm is simpler (no mean subtraction)
Manual RMS: 1.8708
Expected: sqrt((4+1+9+0)/4) = sqrt(3.5) = 1.8708

Code 2: Implementing SwiGLU

import torch
import torch.nn as nn

class SwiGLU(nn.Module):
    """Gated linear unit with a Swish gate.

    A single linear layer produces ``2 * output_dim`` features. The first
    half goes through Swish and is multiplied element-wise by the second
    half, giving ``output_dim`` features.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.output_dim = output_dim
        # One fused projection supplies both factors of the gated product.
        self.proj = nn.Linear(input_dim, 2 * output_dim)

    def forward(self, x):
        """Return ``swish(a) * b`` where ``a`` and ``b`` are the halves of ``proj(x)``."""
        projected = self.proj(x)  # (..., 2 * output_dim)
        activated_half, linear_half = projected.chunk(2, dim=-1)
        # Swish(a) = a * sigmoid(a)
        swished = activated_half * torch.sigmoid(activated_half)
        return swished * linear_half

# Smoke test: one 4-feature row in, one 8-feature gated row out.
swiglu = SwiGLU(input_dim=4, output_dim=8)
x = torch.tensor([[1.5, -0.5, 2.0, 0.3]])
output = swiglu(x)

for label, value in (
    ("Input shape:", x.shape),
    ("Output shape:", output.shape),
    ("SwiGLU output:", output),
):
    print(label, value)
print("\nKey: SwiGLU uses gating (element-wise multiply) for selectivity")

Expected Output:

Input shape: torch.Size([1, 4])
Output shape: torch.Size([1, 8])
SwiGLU output: tensor([...], grad_fn=<MulBackward0>)

Key: SwiGLU uses gating (element-wise multiply) for selectivity

Code 3: Loading and Running LLaMA

# Install dependencies (first time only). Loading in 8-bit with
# device_map="auto" also needs accelerate and bitsandbytes.
# !pip install transformers torch accelerate bitsandbytes

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

print("Loading LLaMA-2 tokenizer...")
# Note: LLaMA-2 7B is available on Hugging Face.
# The 8-bit loading below is what keeps this 7B checkpoint within Colab's GPU memory.
model_name = "meta-llama/Llama-2-7b-hf"

# Note: You may need to accept the license on Hugging Face first
# Go to: https://huggingface.co/meta-llama/Llama-2-7b-hf

# The broad catch is deliberate for a notebook cell: any failure (gated repo,
# missing package, no GPU) is reported as a note instead of a traceback.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("✓ Tokenizer loaded")

    # Load model in 8-bit quantization to save memory. The quantization
    # settings go through BitsAndBytesConfig; passing load_in_8bit directly
    # to from_pretrained is deprecated.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        torch_dtype=torch.float16,
    )
    print("✓ Model loaded (8-bit quantized)")

    # Generate text
    prompt = "The most important breakthrough in AI is"
    # The tokenizer returns CPU tensors; move them to wherever device_map
    # placed the model so generate() does not mix devices.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs.input_ids

    print(f"\nPrompt: {prompt}")
    print("Generating...")

    outputs = model.generate(
        input_ids,
        attention_mask=inputs.attention_mask,  # explicit mask avoids padding ambiguity
        max_length=100,  # total length: prompt tokens + generated tokens
        temperature=0.7,
        do_sample=True,
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Output: {generated_text}")

except Exception as e:
    print(f"Note: {e}")
    print("\nLLaMA requires accepting the license on Hugging Face Hub first.")
    print("Alternative: Use a smaller open model like Mistral-7B (Paper 18)")

Code 4: Using Mistral-7B (Openly Licensed Alternative)

If LLaMA licensing is an issue, Mistral-7B is fully open and can run on Colab:

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

print("Loading Mistral-7B tokenizer...")
model_name = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# 8-bit weights via BitsAndBytesConfig; passing load_in_8bit directly to
# from_pretrained is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
)

print("✓ Model loaded")

# Generate text
prompt = "Let's solve a math problem: "
# The tokenizer returns CPU tensors; move them to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = inputs.input_ids

outputs = model.generate(
    input_ids,
    attention_mask=inputs.attention_mask,
    max_length=150,  # total length: prompt tokens + generated tokens
    do_sample=True,  # temperature/top_p are ignored under greedy decoding
    temperature=0.7,
    top_p=0.9,
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated: {generated_text}")

Code 5: Understanding LLaMA’s Architecture

import torch
import torch.nn as nn

# Simplified LLaMA block (what happens inside each transformer layer)
class SimpleLLaMABlock(nn.Module):
    """Simplified pre-norm transformer block: attention, then a SwiGLU FFN.

    Both sub-layers normalize their input with RMSNorm and add their output
    back onto the residual stream. The attention call passes no mask, so this
    sketch attends over the whole sequence.

    Args:
        dim: Model (embedding) dimension of the residual stream.
        num_heads: Number of attention heads; must divide ``dim``.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        self.rms_norm1 = RMSNorm(dim)  # Pre-norm
        self.attention = nn.MultiheadAttention(dim, num_heads, batch_first=True)

        self.rms_norm2 = RMSNorm(dim)  # Pre-norm
        self.swiglu = SwiGLU(dim, dim * 4)  # FFN with SwiGLU
        # SwiGLU returns dim * 4 features; project back to dim so the result
        # can be added to the residual stream.
        self.ffn_out = nn.Linear(dim * 4, dim)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, seq, dim); shape is preserved."""
        # Pre-norm + Attention + Residual
        norm_x = self.rms_norm1(x)
        attn_out, _ = self.attention(norm_x, norm_x, norm_x)
        x = x + attn_out

        # Pre-norm + SwiGLU + down-projection + Residual
        norm_x = self.rms_norm2(x)
        ffn_hidden = self.swiglu(norm_x)  # (batch, seq, dim * 4)
        x = x + self.ffn_out(ffn_hidden)

        return x

# Smoke test: run one random sequence through a single block.
dim, num_heads = 64, 8
block = SimpleLLaMABlock(dim, num_heads)

# Shape is (batch=1, seq=10, dim=64).
x = torch.randn(1, 10, dim)
output = block(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print("✓ LLaMA block processed successfully")

How to Run on Google Colab

  1. Go to Google Colab
  2. Create a new notebook
  3. Copy each code block above into separate cells
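  4. Run cells in order (Code 1 → Code 2 → Code 3 or 4 → Code 5). Code 5 reuses the RMSNorm and SwiGLU classes defined in Code 1 and Code 2, so those cells must run first.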
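  5. For Code 3 (LLaMA), you’ll need to accept the model license on its Hugging Face page (https://huggingface.co/meta-llama/Llama-2-7b-hf) and log in to Hugging Face from the notebook with an access token.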

Colab memory tips:

  • Use 8-bit quantization (shown in code) to fit models in Colab’s 16GB GPU
  • For full precision, you may need a Colab Pro account
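  • Mistral-7B (Code 4) is about the same size as LLaMA-2 7B (roughly 7 billion parameters), so it needs the same 8-bit loading; its advantage is the permissive Apache 2.0 license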

Key Observations from the Code

  1. RMSNorm is simpler: No mean subtraction, just RMS + scale
  2. SwiGLU uses gating: Element-wise multiply allows selective feature use
  3. LLaMA layers stack: Many blocks of Attention + SwiGLU FFN
  4. Pre-normalization: RMSNorm before operations, not after (more stable)
  5. Residual connections: Every layer preserves the original signal via x = x + output

These architectural choices, together with training on far more tokens, let LLaMA-13B outperform the much larger GPT-3 (175B) on most benchmarks while being much cheaper to run.