This guide covers tensors, autograd, neural networks, training loops, data loaders, CUDA, and production deployment.
PyTorch is a Python-based deep learning framework developed by Meta. Its dynamic computation graph and Pythonic design make it the framework of choice for researchers worldwide.
import torch
import torch.nn as nn
print(torch.__version__) # e.g., 2.5.0
# ── Creating Tensors ──
x = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32) # explicit dtype; integers default to int64
zeros = torch.zeros(3, 4)
ones = torch.ones(2, 3)
rand = torch.randn(3, 3) # Normal distribution (mean 0, std 1)
randint = torch.randint(0, 10, (3, 3)) # uniform integers in [0, 10)
eye = torch.eye(3) # Identity matrix
arange = torch.arange(0, 10, 2) # [0, 2, 4, 6, 8] — end-exclusive, like range()
linspace = torch.linspace(0, 1, 5) # 5 evenly spaced points, endpoints included
# ── Tensor Operations ──
a = torch.randn(3, 3)
b = torch.randn(3, 3)
print(a + b) # Element-wise addition
print(a * b) # Element-wise multiplication (NOT matrix multiply)
print(a @ b) # Matrix multiplication (or torch.mm, torch.matmul)
print(torch.cat([a, b], dim=0)) # Concatenate along rows -> shape (6, 3)
print(a.view(9)) # Reshape (like np.reshape); requires contiguous memory
print(a.reshape(9, 1)) # Reshape; copies only if a view is impossible
print(a.permute(1, 0)) # Permute dimensions (transpose for 2-D)
# ── GPU Support ──
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x = x.to(device) # Move to GPU (returns a copy on the target device)
# NOTE(review): the two prints below raise on CPU-only machines —
# guard with torch.cuda.is_available() before running them.
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"CUDA: {torch.version.cuda}")
# ── NumPy Interop ──
import numpy as np
arr = np.array([1, 2, 3])
t = torch.from_numpy(arr) # NumPy -> Tensor; shares memory with arr (no copy)
arr_back = t.numpy() # Tensor -> NumPy (CPU only); also shares memory
# ── Autograd ──
x = torch.randn(3, requires_grad=True)
y = x * 2 + 1
z = y.sum()
z.backward() # Compute gradients
print(x.grad) # dz/dx = 2 for all elements| Property | Description | Example |
|---|---|---|
| dtype | Data type of tensor | torch.float32, torch.int64, torch.bool |
| shape | Dimensions of tensor | torch.Size([3, 4]) |
| device | CPU or GPU placement | torch.device('cuda:0') |
| requires_grad | Track gradients for autograd | True/False |
| grad | Gradient tensor (after backward) | x.grad after loss.backward() |
| data | Underlying data tensor | tensor.data for detached copy |
PyTorch's autograd engine automatically computes gradients using reverse-mode automatic differentiation. It tracks operations on tensors and builds a dynamic computation graph.
import torch
# ── Basic Autograd ──
x = torch.tensor(2.0, requires_grad=True)
y = x ** 3 + 2 * x + 1 # y = x^3 + 2x + 1
y.backward() # dy/dx = 3x^2 + 2 = 14 at x = 2
print(f"Gradient: {x.grad}") # tensor(14.)
# ── Gradient Accumulation ──
# NOTE(review): `model` is not defined in this snippet — placeholder for any nn.Module.
model.zero_grad() # Clear gradients before each step (grads accumulate by default)
# ── Detach from computation graph ──
x = torch.randn(3, requires_grad=True)
y = x * 2
z = y.detach() # z has no grad_fn, no gradients tracked
# ── No Gradient (inference mode) ──
with torch.no_grad():
    y = model(x) # Faster, less memory: no graph is recorded
# ── torch.inference_mode() (PyTorch 1.9+) ──
with torch.inference_mode():
    y = model(x) # Even faster than no_grad(): also skips view/version-counter tracking
# ── Gradient Clipping ──
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # rescales so total norm <= 1.0
torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5) # clamps each element to [-0.5, 0.5]
# ── Custom backward ──
class MyFunction(torch.autograd.Function):
    """Custom autograd op computing y = x**2 with a hand-written gradient.

    forward squares the input; backward applies the chain rule
    dy/dx = 2x, scaled by the upstream gradient.
    """

    @staticmethod
    def forward(ctx, x):
        # Stash the input; backward needs it to evaluate dy/dx = 2x.
        ctx.save_for_backward(x)
        return x ** 2

    @staticmethod
    def backward(ctx, grad_output):
        (saved_x,) = ctx.saved_tensors
        # Chain rule: d(x^2)/dx = 2x, times the incoming gradient.
        return grad_output * 2 * saved_x
# Usage: apply() runs forward and registers the custom backward in the graph.
y = MyFunction.apply(torch.tensor(3.0, requires_grad=True))
y.backward()

nn.Module is the base class for all neural network models in PyTorch. It manages parameters, handles device placement, and provides a clean interface for model definition.
import torch
import torch.nn as nn
import torch.nn.functional as F
# ── Simple MLP ──
class SimpleMLP(nn.Module):
    """Three-layer perceptron with batch-norm and dropout regularization.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width of both hidden layers.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.3)
        self.bn1 = nn.BatchNorm1d(hidden_dim)

    def forward(self, x):
        # fc1 -> batch norm -> relu -> dropout
        hidden = self.dropout(F.relu(self.bn1(self.fc1(x))))
        # fc2 -> relu -> dropout
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        # Raw logits; pair with nn.CrossEntropyLoss.
        return self.fc3(hidden)
# ── CNN for Image Classification ──
class ImageCNN(nn.Module):
    """Small CNN for RGB images: three conv stages followed by an MLP head.

    Channel progression 3 -> 32 -> 64 -> 128; the final stage ends with
    adaptive average pooling so any input resolution yields a 128-d vector.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        stages = []
        for in_ch, out_ch in ((3, 32), (32, 64), (64, 128)):
            stages += [
                nn.Conv2d(in_ch, out_ch, 3, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
            ]
            # First two stages halve the spatial size; the last collapses
            # the feature map to 1x1 regardless of input resolution.
            stages.append(nn.AdaptiveAvgPool2d(1) if out_ch == 128 else nn.MaxPool2d(2))
        self.features = nn.Sequential(*stages)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        # Convolutional feature extractor, then the fully connected head.
        return self.classifier(self.features(x))
# ── ResNet Block ──
class ResidualBlock(nn.Module):
    """Basic two-conv residual block: out = relu(F(x) + x).

    Both convolutions preserve channel count and spatial size, so the
    identity skip connection needs no projection.
    """

    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        branch = F.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))
        # Identity skip keeps gradients flowing through deep stacks.
        return F.relu(branch + x)
# ── Model Utilities ──
model = ImageCNN()
print(model) # Print architecture
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
# Save/Load
torch.save(model.state_dict(), 'model.pth')
model.load_state_dict(torch.load('model.pth', weights_only=True))
model.eval() # Set to evaluation mode (affects Dropout/BN)| Method/Attribute | Purpose | Usage |
|---|---|---|
| __init__(self) | Define layers and sub-modules | self.fc = nn.Linear(784, 256) |
| forward(self, x) | Define forward pass computation | return self.fc(x) |
| parameters() | Iterator over all learnable parameters | optimizer = Adam(model.parameters()) |
| named_parameters() | Parameters with names | for name, p in model.named_parameters() |
| train() | Set to training mode | model.train() # enables dropout, BN updates |
| eval() | Set to evaluation mode | model.eval() # disables dropout, BN uses running stats |
| to(device) | Move model to device | model.to('cuda') |
| state_dict() | All model weights as dict | torch.save(model.state_dict(), path) |
PyTorch provides DataLoader and Dataset abstractions for efficient data loading with shuffling, batching, multi-processing, and custom data pipelines.
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import datasets, transforms
# ── Built-in Datasets ──
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.RandomHorizontalFlip(), # train-time augmentation
    transforms.ToTensor(), # HWC uint8 [0,255] -> CHW float [0,1]
    # ImageNet channel statistics — standard for pre-trained backbones
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
train_dataset = datasets.ImageFolder('./data/train', transform=transform) # one class per subfolder
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                          num_workers=4, pin_memory=True) # pin_memory speeds host->GPU copies
# ── Custom Dataset ──
class MyDataset(Dataset):
    """Text-classification dataset that tokenizes lazily in __getitem__.

    Args:
        texts: sequence of raw inputs (coerced to str per item).
        labels: sequence of integer class labels, parallel to texts.
        tokenizer: HF-style callable returning 'input_ids'/'attention_mask'.
        max_len: fixed sequence length (padded and truncated to this).
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) drops the batch dim added by return_tensors='pt'.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }
# ── Train/Val Split ──
# NOTE(review): texts, labels, tokenizer must be defined by the surrounding script.
dataset = MyDataset(texts, labels, tokenizer)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size # remainder avoids off-by-one from rounding
train_ds, val_ds = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False) # no shuffling for evaluation
# ── Collate Function for variable-length sequences ──
def collate_fn(batch):
    """Pad a batch of variable-length examples to a common length.

    Each element of `batch` is a dict with 1-D 'input_ids' and
    'attention_mask' tensors plus a scalar 'labels' tensor; returns
    right-padded, batch-first tensors.
    """
    from torch.nn.utils.rnn import pad_sequence

    ids, masks, labels = [], [], []
    for example in batch:
        ids.append(example['input_ids'])
        masks.append(example['attention_mask'])
        labels.append(example['labels'])
    return {
        'input_ids': pad_sequence(ids, batch_first=True, padding_value=0),
        'attention_mask': pad_sequence(masks, batch_first=True, padding_value=0),
        'labels': torch.tensor(labels),
    }
# Re-create the loader with the custom collate_fn for variable-length batches.
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True,
                          collate_fn=collate_fn)

| Argument | Default | Description | Best Practice |
|---|---|---|---|
| batch_size | 1 | Number of samples per batch | 32 for images, 16-32 for NLP |
| shuffle | False | Shuffle data each epoch | True for training, False for val/test |
| num_workers | 0 | Parallel data loading processes | 4-8 on multi-core, 0 for Windows debug |
| pin_memory | False | Faster CPU-to-GPU transfer | True when using GPU |
| drop_last | False | Drop last incomplete batch | True for consistent batch norms |
| persistent_workers | False | Keep workers alive between epochs | True for large datasets |
PyTorch's explicit training loop gives full control over every step: forward pass, loss computation, backpropagation, and optimizer update.
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): MyModel is a placeholder — any nn.Module works here.
model = MyModel().to(device)
# ── Loss & Optimizer ──
criterion = nn.CrossEntropyLoss() # expects raw logits + integer class targets
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01) # decoupled weight decay
scheduler = CosineAnnealingLR(optimizer, T_max=100) # anneal LR over 100 epochs
# ── Mixed Precision Training ──
from torch.amp import GradScaler, autocast
scaler = GradScaler('cuda') # scales the loss to avoid fp16 gradient underflow
# ── Training Loop ──
best_val_acc = 0.0
patience = 10 # epochs without val improvement before early stopping
patience_counter = 0
for epoch in range(100):
    # Training Phase
    model.train() # enable dropout / batch-norm updates
    train_loss = 0.0
    for batch in train_loader:
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        with autocast('cuda'): # run the forward pass in mixed precision
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward() # scale loss, then backprop
        scaler.unscale_(optimizer) # unscale so clipping sees true gradient norms
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer) # skips the step if gradients overflowed
        scaler.update()
        train_loss += loss.item()
    scheduler.step() # per-epoch LR update
    # Validation Phase
    model.eval()
    correct = total = 0
    val_loss = 0.0
    with torch.inference_mode(): # no autograd bookkeeping during eval
        for batch in val_loader:
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    val_acc = 100.0 * correct / total
    print(f"Epoch {epoch}: Train Loss={train_loss/len(train_loader):.4f}, "
          f"Val Acc={val_acc:.2f}%")
    # Early Stopping: keep the best checkpoint, stop after `patience` flat epochs
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_model.pth')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

PyTorch 2.0 introduced torch.compile for Just-In-Time compilation, dramatically speeding up training. Distributed Data Parallel (DDP) enables multi-GPU training.
# ── torch.compile (PyTorch 2.0+) ──
# Simple one-line speedup; returns an optimized wrapper around the module.
model = torch.compile(model)
# With options
model = torch.compile(model, mode='reduce-overhead') # CUDA-graph mode: best when per-step framework overhead dominates (small models / small batches)
# mode='default' - good balance of compile time vs speed
# mode='max-autotune' - longest compile, fastest runtime
# ── Distributed Data Parallel (DDP) ──
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
def setup(rank, world_size):
    """Join the NCCL process group for distributed training.

    Args:
        rank: this process's index, in [0, world_size).
        world_size: total number of participating processes (one per GPU).
    """
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
def cleanup():
    """Tear down the process group started by setup()."""
    dist.destroy_process_group()
class Trainer:
    """Minimal DDP training wrapper: one process per GPU, rank == device index.

    NOTE(review): assumes dist.init_process_group has already been called and
    that `dataloader` uses a DistributedSampler — confirm against the launcher.
    """

    def __init__(self, model, rank, world_size):
        self.rank = rank
        # Move the replica onto this process's GPU, then wrap in DDP so
        # gradients are all-reduced across ranks during backward().
        self.model = DDP(model.to(rank), device_ids=[rank])
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)

    def train(self, dataloader, epoch):
        self.model.train()
        # Without set_epoch every epoch would replay the same shard order.
        dataloader.sampler.set_epoch(epoch)
        for batch in dataloader:
            batch = {key: value.to(self.rank) for key, value in batch.items()}
            self.optimizer.zero_grad()
            loss = self.model(batch).loss
            loss.backward()
            self.optimizer.step()
# ── Launch: torchrun --nproc_per_node=4 train.py ──
# ── FSDP (Fully Sharded Data Parallel) for large models ──
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
auto_wrap_policy = size_based_auto_wrap_policy(min_num_params=1e8)
model = FSDP(model, auto_wrap_policy=auto_wrap_policy)| Strategy | Memory Efficiency | Speed | Use Case |
|---|---|---|---|
| DataParallel (DP) | Low (model replicated) | Moderate | Quick prototyping, single machine, NOT recommended |
| DistributedDataParallel (DDP) | Moderate (model replicated per GPU) | Good | Standard multi-GPU training |
| FSDP | High (shards params, grads, optimizer) | Good | Large models that don't fit in GPU memory |
| DeepSpeed ZeRO | Very High (3 stages of sharding) | Good | Very large models, LLM training |
| Pipeline Parallelism | High (model split across GPUs) | Moderate | Very large models with sequential layers |
| Tensor Parallelism | Moderate (tensor ops split) | Good | Megatron-style LLM training |
torchvision provides datasets, model architectures, and image transformations for computer vision. It includes dozens of pre-trained models.
import torch
from torchvision import models, transforms
# ── Load Pre-trained Models ──
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
efficientnet = models.efficientnet_b0(weights='IMAGENET1K_V1') # string shorthand for the weights enum
vit = models.vit_b_16(weights='IMAGENET1K_V1')
# ── Modify for Fine-tuning ──
# NOTE(review): `nn` and `num_classes` must be defined by the surrounding script.
model = models.resnet50(weights='IMAGENET1K_V2')
for param in model.parameters():
    param.requires_grad = False # Freeze all
for param in model.fc.parameters():
    param.requires_grad = True # Unfreeze classifier
# NOTE(review): the unfreeze loop above is redundant — the next line replaces
# model.fc entirely, and a fresh nn.Linear requires grad by default.
model.fc = nn.Linear(model.fc.in_features, num_classes)
# ── Image Transforms ──
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    # ImageNet channel statistics — required to match pre-trained weights
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224), # deterministic crop for evaluation
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
# ── Object Detection ──
detection_model = models.detection.fasterrcnn_resnet50_fpn(
    weights=models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
# ── Segmentation ──
seg_model = models.segmentation.fcn_resnet50(
    weights=models.segmentation.FCN_ResNet50_Weights.DEFAULT)

| Category | Models Available | Key Models |
|---|---|---|
| Classification | 20+ architectures | ResNet, EfficientNet, ViT, ConvNeXt, DenseNet, VGG |
| Detection | 5+ architectures | Faster R-CNN, SSD, RetinaNet, FCOS |
| Segmentation | 5+ architectures | FCN, DeepLabV3, LRASPP |
| Video | 3 architectures | ResNet3D, R3D, S3D, MVIT |
| Optical Flow | RAFT | RAFT (large/small) |
PyTorch Lightning abstracts away boilerplate training code while keeping PyTorch's flexibility. It handles GPU/TPU, distributed training, logging, and checkpoints automatically.
import pytorch_lightning as pl
import torch
from torch import nn
from torch.nn import functional as F
class LitClassifier(pl.LightningModule):
    """LightningModule wrapping a two-hidden-layer MLP classifier.

    Lightning's Trainer calls the *_step hooks and configure_optimizers;
    the loop, device placement, and checkpointing live in the Trainer.
    """

    def __init__(self, input_dim=784, hidden_dim=256, num_classes=10, lr=1e-3):
        super().__init__()
        # Records the init args into self.hparams and the checkpoint.
        self.save_hyperparameters()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # Returns raw logits; softmax is folded into F.cross_entropy below.
        return self.classifier(self.encoder(x))

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(1) == y).float().mean()
        self.log('train_loss', loss, prog_bar=True)
        self.log('train_acc', acc, prog_bar=True)
        # Lightning backprops whatever training_step returns.
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        acc = (logits.argmax(1) == y).float().mean()
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        acc = (logits.argmax(1) == y).float().mean()
        self.log('test_acc', acc)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=50)
        # Two parallel lists: optimizers and their LR schedulers.
        return [optimizer], [scheduler]
# ── Training ──
model = LitClassifier()
trainer = pl.Trainer(
    max_epochs=50,
    accelerator='auto', # auto-detect GPU/TPU
    devices='auto',
    precision='16-mixed', # Mixed precision
    callbacks=[
        pl.callbacks.EarlyStopping(monitor='val_loss', patience=5),
        pl.callbacks.ModelCheckpoint(monitor='val_acc', mode='max'), # keep the best-val-acc checkpoint
    ],
    logger=pl.loggers.TensorBoardLogger('./logs'),
)
# NOTE(review): train_loader/val_loader/test_loader must be defined elsewhere.
trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)

Essential PyTorch interview questions covering fundamentals, training, optimization, and advanced topics.