The Code: Implementing RMSNorm and Running LLaMA — LLaMA: Open and Efficient Foundation Language Models

Below are complete Python implementations that run on Google Colab. We’ll implement RMSNorm from scratch and show how to load and run LLaMA.

Code 1: Implementing RMSNorm from Scratch

import torch
import torch.nn as nn
import math

# Implement RMSNorm (Root Mean Square Normalization)
class RMSNorm(nn.Module):
    """Root-mean-square normalization with a learnable per-feature gain.

    The input is divided by the RMS of its last dimension and multiplied by
    ``gamma``. No mean is subtracted and no bias is added.

    Args:
        dim: Size of the last (feature) dimension of the input.
        eps: Constant added under the square root for numerical stability.
    """

    def __init__(self, dim, eps=1e-8):
        super().__init__()
        # The gain starts at one, so a fresh module only divides by the RMS.
        self.gamma = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        """Return ``x / sqrt(mean(x**2, dim=-1) + eps) * gamma``."""
        # Mean of squares over the last dimension; keepdim lets it broadcast
        # against x in the division below.
        mean_square = torch.mean(x**2, dim=-1, keepdim=True)
        rms = torch.sqrt(mean_square + self.eps)
        return (x / rms) * self.gamma

# Run the same input through RMSNorm and PyTorch's built-in LayerNorm
rms_norm = RMSNorm(4)
layer_norm = nn.LayerNorm(4)

# A single row with four features
x = torch.tensor([[2.0, -1.0, 3.0, 0.0]])

print("Input:", x)
for label, module in (
    ("RMSNorm output:", rms_norm),
    ("LayerNorm output:", layer_norm),
):
    print(label, module(x))
print("\nKey difference: RMSNorm is simpler (no mean subtraction)")

# Recompute the RMS by hand (without eps) and print it next to the
# value worked out on paper
mean_square = torch.mean(x**2, dim=-1, keepdim=True)
rms_manual = torch.sqrt(mean_square)
print(f"Manual RMS: {rms_manual.item():.4f}")
print("Expected: sqrt((4+1+9+0)/4) = sqrt(3.5) = 1.8708")

Expected Output:

Input: tensor([[ 2., -1.,  3.,  0.]])
RMSNorm output: tensor([[ 1.0690, -0.5345,  1.6036,  0.0000]], grad_fn=<MulBackward0>)
LayerNorm output: tensor([[ 0.6325, -1.2649,  1.2649, -0.6325]], grad_fn=<NativeLayerNormBackward0>)

Key difference: RMSNorm is simpler (no mean subtraction)
Manual RMS: 1.8708
Expected: sqrt((4+1+9+0)/4) = sqrt(3.5) = 1.8708

Code 2: Implementing SwiGLU

import torch
import torch.nn as nn

class SwiGLU(nn.Module):
    """Swish-gated linear unit.

    A single linear layer produces ``2 * output_dim`` features. The first
    half goes through Swish (``v * sigmoid(v)``) and is multiplied
    element-wise by the second half.

    Args:
        input_dim: Size of the last dimension of the input.
        output_dim: Size of the last dimension of the output.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.output_dim = output_dim
        # One fused projection yields both branches, hence twice the width.
        self.proj = nn.Linear(input_dim, 2 * output_dim)

    def forward(self, x):
        """Return ``swish(first_half) * second_half`` of the fused projection."""
        fused = self.proj(x)  # Shape: (..., 2*output_dim)

        # Equal halves along the feature axis: the first is activated,
        # the second stays linear.
        activated_half, linear_half = fused.chunk(2, dim=-1)

        # Swish activation: v * sigmoid(v)
        swish = activated_half * torch.sigmoid(activated_half)

        return swish * linear_half

# Smoke-test SwiGLU: project one 4-feature row to 8 gated features
x = torch.tensor([[1.5, -0.5, 2.0, 0.3]])
swiglu = SwiGLU(input_dim=4, output_dim=8)
output = swiglu(x)

for label, value in (
    ("Input shape:", x.shape),
    ("Output shape:", output.shape),
    ("SwiGLU output:", output),
):
    print(label, value)
print("\nKey: SwiGLU uses gating (element-wise multiply) for selectivity")

Expected Output:

Input shape: torch.Size([1, 4])
Output shape: torch.Size([1, 8])
SwiGLU output: tensor([...], grad_fn=<MulBackward0>)

Key: SwiGLU uses gating (element-wise multiply) for selectivity

Code 3: Loading and Running LLaMA

# Install the required libraries (first time only).
# 8-bit loading with device_map="auto" also needs accelerate and bitsandbytes.
# !pip install transformers torch accelerate bitsandbytes

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

print("Loading LLaMA-2 tokenizer...")
# Note: LLaMA-2 7B is the smallest LLaMA-2 checkpoint on Hugging Face.
# The 8-bit quantization below is what keeps it within Colab's GPU memory.
model_name = "meta-llama/Llama-2-7b-hf"

# Note: You may need to accept the license on Hugging Face first
# Go to: https://huggingface.co/meta-llama/Llama-2-7b-hf

# The whole cell is best-effort: any failure (gated repo, missing package,
# no GPU) is reported as a note instead of a traceback.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("✓ Tokenizer loaded")

    # Load model in 8-bit quantization to save memory.
    # The quantization settings go through BitsAndBytesConfig; passing
    # load_in_8bit directly to from_pretrained is deprecated.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        torch_dtype=torch.float16,
    )
    print("✓ Model loaded (8-bit quantized)")

    # Generate text
    prompt = "The most important breakthrough in AI is"
    # The tokenizer returns CPU tensors, while device_map="auto" may have put
    # the model on the GPU; move the prompt to the model's device.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    print(f"\nPrompt: {prompt}")
    print("Generating...")

    # max_length counts the prompt tokens as well as the generated ones.
    outputs = model.generate(
        input_ids,
        max_length=100,
        temperature=0.7,
        do_sample=True,
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Output: {generated_text}")

except Exception as e:
    print(f"Note: {e}")
    print("\nLLaMA requires accepting the license on Hugging Face Hub first.")
    print("Alternative: Use an openly licensed model like Mistral-7B (Paper 18)")

Code 4: Using Mistral-7B (Openly Licensed Alternative)

If LLaMA licensing is an issue, Mistral-7B is fully open and can run on Colab:

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

print("Loading Mistral-7B tokenizer...")
model_name = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# 8-bit settings go through BitsAndBytesConfig; passing load_in_8bit directly
# to from_pretrained is deprecated. Requires accelerate and bitsandbytes.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
)

print("✓ Model loaded")

# Generate text
prompt = "Let's solve a math problem: "
# The tokenizer returns CPU tensors, while device_map="auto" may have put the
# model on the GPU; move the prompt to the model's device.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# temperature and top_p only take effect when sampling is enabled; without
# do_sample=True generate() decodes greedily and ignores them.
outputs = model.generate(
    input_ids,
    max_length=150,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated: {generated_text}")

Code 5: Understanding LLaMA’s Architecture

import torch
import torch.nn as nn

# Simplified LLaMA block (what happens inside each transformer layer)
class SimpleLLaMABlock(nn.Module):
    """Simplified pre-norm transformer block: attention, then a SwiGLU FFN.

    Each sub-layer normalizes its input with RMSNorm and adds its output back
    to the un-normalized input (residual connection). This sketch passes no
    attention mask and applies no positional encoding.

    Relies on the RMSNorm and SwiGLU classes defined in Code 1 and Code 2.

    Args:
        dim: Feature size of the input and of the output.
        num_heads: Number of heads passed to nn.MultiheadAttention.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        self.rms_norm1 = RMSNorm(dim)  # Pre-norm
        self.attention = nn.MultiheadAttention(dim, num_heads, batch_first=True)

        self.rms_norm2 = RMSNorm(dim)  # Pre-norm
        self.swiglu = SwiGLU(dim, dim * 4)  # FFN with SwiGLU
        # SwiGLU returns dim * 4 features; project back down to dim so the
        # residual addition in forward() has matching shapes.
        self.ffn_down = nn.Linear(dim * 4, dim)

    def forward(self, x):
        """Apply attention and the feed-forward sub-layer, each with a residual.

        Args:
            x: Tensor whose last dimension is ``dim``; with batch_first=True
                the attention layer treats it as (batch, seq, dim).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Pre-norm + Attention + Residual
        norm_x = self.rms_norm1(x)
        # Self-attention: query, key and value are all the normalized input.
        attn_out, _ = self.attention(norm_x, norm_x, norm_x)
        x = x + attn_out

        # Pre-norm + SwiGLU + down-projection + Residual
        norm_x = self.rms_norm2(x)
        ffn_out = self.ffn_down(self.swiglu(norm_x))
        x = x + ffn_out

        return x

# Smoke test: push one random sequence through a single block and
# print the shapes going in and coming out
dim, num_heads = 64, 8
block = SimpleLLaMABlock(dim, num_heads)

# Random input laid out as (batch=1, sequence length=10, features=dim)
x = torch.randn(1, 10, dim)
output = block(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print("✓ LLaMA block processed successfully")

How to Run on Google Colab

Go to Google Colab
Create a new notebook
Copy each code block above into separate cells
Run cells in order (Code 1 → Code 2 → Code 3 or 4 → Code 5); Code 5 reuses the RMSNorm and SwiGLU classes defined in Code 1 and Code 2
For Code 3 (LLaMA), you’ll need to:
- Accept the license at https://huggingface.co/meta-llama/Llama-2-7b-hf
- Authenticate with: huggingface-cli login (inside a Colab cell, prefix it with an exclamation mark: !huggingface-cli login)

Colab memory tips:

Use 8-bit quantization (shown in code) to fit models in Colab’s 16GB GPU
For full precision, you may need a Colab Pro account
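Mistral-7B (Code 4) is about the same size as LLaMA-2 7B (roughly 7 billion parameters), so load it in 8-bit as well; its advantage is the permissive Apache 2.0 license, not a smaller memory footprint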

Key Observations from the Code

RMSNorm is simpler: No mean subtraction, just RMS + scale
SwiGLU uses gating: Element-wise multiply allows selective feature use
LLaMA layers stack: Many blocks of Attention + SwiGLU FFN
Pre-normalization: RMSNorm before operations, not after (more stable)
Residual connections: Every layer preserves the original signal via x = x + output

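These architectural choices, together with training on more tokens, made LLaMA more efficient than GPT-3: the LLaMA paper reports that LLaMA-13B outperforms the much larger GPT-3 (175B parameters) on most benchmarks.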