This guide covers tensors, autograd, neural networks, training loops, data loaders, CUDA, and production deployment.
PyTorch is a Python-based deep learning framework developed by Meta. Its dynamic computation graph and Pythonic design make it the framework of choice for researchers worldwide.
import torch
import torch.nn as nn
print(torch.__version__) # e.g., 2.5.0
# ── Creating Tensors ──
x = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32) # explicit dtype; integers default to int64
zeros = torch.zeros(3, 4)
ones = torch.ones(2, 3)
rand = torch.randn(3, 3) # Normal distribution (mean 0, std 1)
randint = torch.randint(0, 10, (3, 3)) # uniform integers in [0, 10)
eye = torch.eye(3) # Identity matrix
arange = torch.arange(0, 10, 2) # [0, 2, 4, 6, 8] — end-exclusive, like range()
linspace = torch.linspace(0, 1, 5) # 5 evenly spaced points, endpoints included
# ── Tensor Operations ──
a = torch.randn(3, 3)
b = torch.randn(3, 3)
print(a + b) # Element-wise addition
print(a * b) # Element-wise multiplication (NOT matrix multiply)
print(a @ b) # Matrix multiplication (or torch.mm, torch.matmul)
print(torch.cat([a, b], dim=0)) # Concatenate along rows -> shape (6, 3)
print(a.view(9)) # Reshape (like np.reshape); requires contiguous memory
print(a.reshape(9, 1)) # Reshape; copies only if a view is impossible
print(a.permute(1, 0)) # Permute dimensions (transpose for 2-D)
# ── GPU Support ──
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x = x.to(device) # Move to GPU (returns a copy on the target device)
# NOTE(review): the two prints below raise on CPU-only machines —
# guard with torch.cuda.is_available() before running them.
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"CUDA: {torch.version.cuda}")
# ── NumPy Interop ──
import numpy as np
arr = np.array([1, 2, 3])
t = torch.from_numpy(arr) # NumPy -> Tensor; shares memory with arr (no copy)
arr_back = t.numpy() # Tensor -> NumPy (CPU only); also shares memory
# ── Autograd ──
x = torch.randn(3, requires_grad=True)
y = x * 2 + 1
z = y.sum()
z.backward() # Compute gradients
print(x.grad) # dz/dx = 2 for all elements| Property | Description | Example |
|---|---|---|
| dtype | Data type of tensor | torch.float32, torch.int64, torch.bool |
| shape | Dimensions of tensor | torch.Size([3, 4]) |
| device | CPU or GPU placement | torch.device('cuda:0') |
| requires_grad | Track gradients for autograd | True/False |
| grad | Gradient tensor (after backward) | x.grad after loss.backward() |
| data | Underlying data tensor | tensor.data for detached copy |
PyTorch's autograd engine automatically computes gradients using reverse-mode automatic differentiation. It tracks operations on tensors and builds a dynamic computation graph.
import torch
# ── Basic Autograd ──
x = torch.tensor(2.0, requires_grad=True)
y = x ** 3 + 2 * x + 1 # y = x^3 + 2x + 1
y.backward() # dy/dx = 3x^2 + 2 = 14 at x = 2
print(f"Gradient: {x.grad}") # tensor(14.)
# ── Gradient Accumulation ──
# NOTE(review): `model` is not defined in this snippet — placeholder for any nn.Module.
model.zero_grad() # Clear gradients before each step (grads accumulate by default)
# ── Detach from computation graph ──
x = torch.randn(3, requires_grad=True)
y = x * 2
z = y.detach() # z has no grad_fn, no gradients tracked
# ── No Gradient (inference mode) ──
with torch.no_grad():
    y = model(x) # Faster, less memory: no graph is recorded
# ── torch.inference_mode() (PyTorch 1.9+) ──
with torch.inference_mode():
    y = model(x) # Even faster than no_grad(): also skips view/version-counter tracking
# ── Gradient Clipping ──
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # rescales so total norm <= 1.0
torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5) # clamps each element to [-0.5, 0.5]
# ── Custom backward ──
class MyFunction(torch.autograd.Function):
    """Custom autograd op computing y = x**2 with a hand-written gradient.

    forward squares the input; backward applies the chain rule
    dy/dx = 2x, scaled by the upstream gradient.
    """

    @staticmethod
    def forward(ctx, x):
        # Stash the input; backward needs it to evaluate dy/dx = 2x.
        ctx.save_for_backward(x)
        return x ** 2

    @staticmethod
    def backward(ctx, grad_output):
        (saved_x,) = ctx.saved_tensors
        # Chain rule: d(x^2)/dx = 2x, times the incoming gradient.
        return grad_output * 2 * saved_x
# Usage: apply() runs forward and registers the custom backward in the graph.
y = MyFunction.apply(torch.tensor(3.0, requires_grad=True))
y.backward()

nn.Module is the base class for all neural network models in PyTorch. It manages parameters, handles device placement, and provides a clean interface for model definition.
import torch
import torch.nn as nn
import torch.nn.functional as F
# ── Simple MLP ──
class SimpleMLP(nn.Module):
    """Three-layer perceptron with batch-norm and dropout regularization.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width of both hidden layers.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.3)
        self.bn1 = nn.BatchNorm1d(hidden_dim)

    def forward(self, x):
        # fc1 -> batch norm -> relu -> dropout
        hidden = self.dropout(F.relu(self.bn1(self.fc1(x))))
        # fc2 -> relu -> dropout
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        # Raw logits; pair with nn.CrossEntropyLoss.
        return self.fc3(hidden)
# ── CNN for Image Classification ──
class ImageCNN(nn.Module):
    """Small CNN for RGB images: three conv stages followed by an MLP head.

    Channel progression 3 -> 32 -> 64 -> 128; the final stage ends with
    adaptive average pooling so any input resolution yields a 128-d vector.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        stages = []
        for in_ch, out_ch in ((3, 32), (32, 64), (64, 128)):
            stages += [
                nn.Conv2d(in_ch, out_ch, 3, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
            ]
            # First two stages halve the spatial size; the last collapses
            # the feature map to 1x1 regardless of input resolution.
            stages.append(nn.AdaptiveAvgPool2d(1) if out_ch == 128 else nn.MaxPool2d(2))
        self.features = nn.Sequential(*stages)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        # Convolutional feature extractor, then the fully connected head.
        return self.classifier(self.features(x))
# ── ResNet Block ──
class ResidualBlock(nn.Module):
    """Basic two-conv residual block: out = relu(F(x) + x).

    Both convolutions preserve channel count and spatial size, so the
    identity skip connection needs no projection.
    """

    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        branch = F.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))
        # Identity skip keeps gradients flowing through deep stacks.
        return F.relu(branch + x)
# ── Model Utilities ──
model = ImageCNN()
print(model) # Print architecture
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
# Save/Load
torch.save(model.state_dict(), 'model.pth')
model.load_state_dict(torch.load('model.pth', weights_only=True))
model.eval() # Set to evaluation mode (affects Dropout/BN)| Method/Attribute | Purpose | Usage |
|---|---|---|
| __init__(self) | Define layers and sub-modules | self.fc = nn.Linear(784, 256) |
| forward(self, x) | Define forward pass computation | return self.fc(x) |
| parameters() | Iterator over all learnable parameters | optimizer = Adam(model.parameters()) |
| named_parameters() | Parameters with names | for name, p in model.named_parameters() |
| train() | Set to training mode | model.train() # enables dropout, BN updates |
| eval() | Set to evaluation mode | model.eval() # disables dropout, BN uses running stats |
| to(device) | Move model to device | model.to('cuda') |
| state_dict() | All model weights as dict | torch.save(model.state_dict(), path) |
PyTorch provides DataLoader and Dataset abstractions for efficient data loading with shuffling, batching, multi-processing, and custom data pipelines.
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import datasets, transforms
# ── Built-in Datasets ──
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.RandomHorizontalFlip(), # train-time augmentation
    transforms.ToTensor(), # HWC uint8 [0,255] -> CHW float [0,1]
    # ImageNet channel statistics — standard for pre-trained backbones
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
train_dataset = datasets.ImageFolder('./data/train', transform=transform) # one class per subfolder
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                          num_workers=4, pin_memory=True) # pin_memory speeds host->GPU copies
# ── Custom Dataset ──
class MyDataset(Dataset):
    """Text-classification dataset that tokenizes lazily in __getitem__.

    Args:
        texts: sequence of raw inputs (coerced to str per item).
        labels: sequence of integer class labels, parallel to texts.
        tokenizer: HF-style callable returning 'input_ids'/'attention_mask'.
        max_len: fixed sequence length (padded and truncated to this).
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) drops the batch dim added by return_tensors='pt'.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }
# ── Train/Val Split ──
# NOTE(review): texts, labels, tokenizer must be defined by the surrounding script.
dataset = MyDataset(texts, labels, tokenizer)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size # remainder avoids off-by-one from rounding
train_ds, val_ds = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False) # no shuffling for evaluation
# ── Collate Function for variable-length sequences ──
def collate_fn(batch):
    """Pad a batch of variable-length examples to a common length.

    Each element of `batch` is a dict with 1-D 'input_ids' and
    'attention_mask' tensors plus a scalar 'labels' tensor; returns
    right-padded, batch-first tensors.
    """
    from torch.nn.utils.rnn import pad_sequence

    ids, masks, labels = [], [], []
    for example in batch:
        ids.append(example['input_ids'])
        masks.append(example['attention_mask'])
        labels.append(example['labels'])
    return {
        'input_ids': pad_sequence(ids, batch_first=True, padding_value=0),
        'attention_mask': pad_sequence(masks, batch_first=True, padding_value=0),
        'labels': torch.tensor(labels),
    }
# Re-create the loader with the custom collate_fn for variable-length batches.
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True,
                          collate_fn=collate_fn)

| Argument | Default | Description | Best Practice |
|---|---|---|---|
| batch_size | 1 | Number of samples per batch | 32 for images, 16-32 for NLP |
| shuffle | False | Shuffle data each epoch | True for training, False for val/test |
| num_workers | 0 | Parallel data loading processes | 4-8 on multi-core, 0 for Windows debug |
| pin_memory | False | Faster CPU-to-GPU transfer | True when using GPU |
| drop_last | False | Drop last incomplete batch | True for consistent batch norms |
| persistent_workers | False | Keep workers alive between epochs | True for large datasets |
PyTorch's explicit training loop gives full control over every step: forward pass, loss computation, backpropagation, and optimizer update.
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): MyModel is a placeholder — any nn.Module works here.
model = MyModel().to(device)
# ── Loss & Optimizer ──
criterion = nn.CrossEntropyLoss() # expects raw logits + integer class targets
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01) # decoupled weight decay
scheduler = CosineAnnealingLR(optimizer, T_max=100) # anneal LR over 100 epochs
# ── Mixed Precision Training ──
from torch.amp import GradScaler, autocast
scaler = GradScaler('cuda') # scales the loss to avoid fp16 gradient underflow
# ── Training Loop ──
best_val_acc = 0.0
patience = 10 # epochs without val improvement before early stopping
patience_counter = 0
for epoch in range(100):
    # Training Phase
    model.train() # enable dropout / batch-norm updates
    train_loss = 0.0
    for batch in train_loader:
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        with autocast('cuda'): # run the forward pass in mixed precision
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward() # scale loss, then backprop
        scaler.unscale_(optimizer) # unscale so clipping sees true gradient norms
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer) # skips the step if gradients overflowed
        scaler.update()
        train_loss += loss.item()
    scheduler.step() # per-epoch LR update
    # Validation Phase
    model.eval()
    correct = total = 0
    val_loss = 0.0
    with torch.inference_mode(): # no autograd bookkeeping during eval
        for batch in val_loader:
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    val_acc = 100.0 * correct / total
    print(f"Epoch {epoch}: Train Loss={train_loss/len(train_loader):.4f}, "
          f"Val Acc={val_acc:.2f}%")
    # Early Stopping: keep the best checkpoint, stop after `patience` flat epochs
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_model.pth')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

PyTorch 2.0 introduced torch.compile for Just-In-Time compilation, dramatically speeding up training. Distributed Data Parallel (DDP) enables multi-GPU training.
# ── torch.compile (PyTorch 2.0+) ──
# Simple one-line speedup; returns an optimized wrapper around the module.
model = torch.compile(model)
# With options
model = torch.compile(model, mode='reduce-overhead') # CUDA-graph mode: best when per-step framework overhead dominates (small models / small batches)
# mode='default' - good balance of compile time vs speed
# mode='max-autotune' - longest compile, fastest runtime
# ── Distributed Data Parallel (DDP) ──
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
def setup(rank, world_size):
    """Join the NCCL process group for distributed training.

    Args:
        rank: this process's index, in [0, world_size).
        world_size: total number of participating processes (one per GPU).
    """
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
def cleanup():
    """Tear down the process group started by setup()."""
    dist.destroy_process_group()
class Trainer:
    """Minimal DDP training wrapper: one process per GPU, rank == device index.

    NOTE(review): assumes dist.init_process_group has already been called and
    that `dataloader` uses a DistributedSampler — confirm against the launcher.
    """

    def __init__(self, model, rank, world_size):
        self.rank = rank
        # Move the replica onto this process's GPU, then wrap in DDP so
        # gradients are all-reduced across ranks during backward().
        self.model = DDP(model.to(rank), device_ids=[rank])
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)

    def train(self, dataloader, epoch):
        self.model.train()
        # Without set_epoch every epoch would replay the same shard order.
        dataloader.sampler.set_epoch(epoch)
        for batch in dataloader:
            batch = {key: value.to(self.rank) for key, value in batch.items()}
            self.optimizer.zero_grad()
            loss = self.model(batch).loss
            loss.backward()
            self.optimizer.step()
# ── Launch: torchrun --nproc_per_node=4 train.py ──
# ── FSDP (Fully Sharded Data Parallel) for large models ──
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
auto_wrap_policy = size_based_auto_wrap_policy(min_num_params=1e8)
model = FSDP(model, auto_wrap_policy=auto_wrap_policy)| Strategy | Memory Efficiency | Speed | Use Case |
|---|---|---|---|
| DataParallel (DP) | Low (model replicated) | Moderate | Quick prototyping, single machine, NOT recommended |
| DistributedDataParallel (DDP) | Moderate (model replicated per GPU) | Good | Standard multi-GPU training |
| FSDP | High (shards params, grads, optimizer) | Good | Large models that don't fit in GPU memory |
| DeepSpeed ZeRO | Very High (3 stages of sharding) | Good | Very large models, LLM training |
| Pipeline Parallelism | High (model split across GPUs) | Moderate | Very large models with sequential layers |
| Tensor Parallelism | Moderate (tensor ops split) | Good | Megatron-style LLM training |
torchvision provides datasets, model architectures, and image transformations for computer vision. It includes dozens of pre-trained models.
import torch
from torchvision import models, transforms
# ── Load Pre-trained Models ──
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
efficientnet = models.efficientnet_b0(weights='IMAGENET1K_V1') # string shorthand for the weights enum
vit = models.vit_b_16(weights='IMAGENET1K_V1')
# ── Modify for Fine-tuning ──
# NOTE(review): `nn` and `num_classes` must be defined by the surrounding script.
model = models.resnet50(weights='IMAGENET1K_V2')
for param in model.parameters():
    param.requires_grad = False # Freeze all
for param in model.fc.parameters():
    param.requires_grad = True # Unfreeze classifier
# NOTE(review): the unfreeze loop above is redundant — the next line replaces
# model.fc entirely, and a fresh nn.Linear requires grad by default.
model.fc = nn.Linear(model.fc.in_features, num_classes)
# ── Image Transforms ──
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    # ImageNet channel statistics — required to match pre-trained weights
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224), # deterministic crop for evaluation
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
# ── Object Detection ──
detection_model = models.detection.fasterrcnn_resnet50_fpn(
    weights=models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
# ── Segmentation ──
seg_model = models.segmentation.fcn_resnet50(
    weights=models.segmentation.FCN_ResNet50_Weights.DEFAULT)

| Category | Models Available | Key Models |
|---|---|---|
| Classification | 20+ architectures | ResNet, EfficientNet, ViT, ConvNeXt, DenseNet, VGG |
| Detection | 5+ architectures | Faster R-CNN, SSD, RetinaNet, FCOS |
| Segmentation | 5+ architectures | FCN, DeepLabV3, LRASPP |
| Video | 3 architectures | ResNet3D, R3D, S3D, MVIT |
| Optical Flow | RAFT | RAFT (large/small) |
PyTorch Lightning abstracts away boilerplate training code while keeping PyTorch's flexibility. It handles GPU/TPU, distributed training, logging, and checkpoints automatically.
import pytorch_lightning as pl
import torch
from torch import nn
from torch.nn import functional as F
class LitClassifier(pl.LightningModule):
    """LightningModule wrapping a two-hidden-layer MLP classifier.

    Lightning's Trainer calls the *_step hooks and configure_optimizers;
    the loop, device placement, and checkpointing live in the Trainer.
    """

    def __init__(self, input_dim=784, hidden_dim=256, num_classes=10, lr=1e-3):
        super().__init__()
        # Records the init args into self.hparams and the checkpoint.
        self.save_hyperparameters()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # Returns raw logits; softmax is folded into F.cross_entropy below.
        return self.classifier(self.encoder(x))

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(1) == y).float().mean()
        self.log('train_loss', loss, prog_bar=True)
        self.log('train_acc', acc, prog_bar=True)
        # Lightning backprops whatever training_step returns.
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(1) == y).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        acc = (logits.argmax(1) == y).float().mean()
        self.log('test_acc', acc)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=50)
        # Two parallel lists: optimizers and their LR schedulers.
        return [optimizer], [scheduler]
# ── Training ──
model = LitClassifier()
trainer = pl.Trainer(
    max_epochs=50,
    accelerator='auto', # auto-detect GPU/TPU
    devices='auto',
    precision='16-mixed', # Mixed precision
    callbacks=[
        pl.callbacks.EarlyStopping(monitor='val_loss', patience=5),
        pl.callbacks.ModelCheckpoint(monitor='val_acc', mode='max'), # keep the best-val-acc checkpoint
    ],
    logger=pl.loggers.TensorBoardLogger('./logs'),
)
# NOTE(review): train_loader/val_loader/test_loader must be defined elsewhere.
trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)

Essential PyTorch interview questions covering fundamentals, training, optimization, and advanced topics.