R Cheatsheet
Data manipulation with dplyr, visualization with ggplot2, tidyverse, statistical tests, and data frames.
# ── Vectors ──
x <- c(1, 2, 3, 4, 5) # numeric vector
y <- c("a", "b", "c") # character vector
z <- c(TRUE, FALSE, TRUE) # logical vector
n <- 1:10 # integer sequence 1 to 10
s <- seq(0, 1, by = 0.1) # sequence with step
r <- seq_len(5) # 1 2 3 4 5
rep(0, 10) # repeat 0 ten times
rep(1:3, each = 2) # 1 1 2 2 3 3
rep(1:3, times = 3) # 1 2 3 1 2 3 1 2 3
# Vector operations
x + 1 # 2 3 4 5 6 (element-wise)
x * 2 # 2 4 6 8 10
x^2 # 1 4 9 16 25
sum(x) # 15
mean(x) # 3
sd(x) # standard deviation
var(x) # variance
min(x); max(x) # 1, 5
length(x) # 5
sort(x) # 1 2 3 4 5
rev(x) # 5 4 3 2 1
unique(x) # remove duplicates
cumsum(x) # 1 3 6 10 15
cumprod(x) # 1 2 6 24 120
# Indexing (1-based!)
x[1] # first element (1)
x[3] # third element (3)
x[c(1, 3, 5)] # elements at positions 1, 3, 5
x[-2] # all except second
x[x > 3] # elements > 3: 4 5
which(x > 3) # indices: 4 5
# Naming
names(x) <- c("a", "b", "c", "d", "e")
x["a"] # 1
# ── Basic Types ──
class(42) # "numeric"
class("hello") # "character"
class(TRUE) # "logical"
is.numeric(42) # TRUE
is.character("hi") # TRUE
as.numeric("42") # 42
as.character(42) # "42"
as.logical(1) # TRUE
NA # missing value
NULL # null object
Inf; -Inf # infinity
NaN # not a number

| Type | Example | Check |
|---|---|---|
| numeric | 3.14, 42 | is.numeric() |
| integer | 1L, 42L | is.integer() |
| character | "hello" | is.character() |
| logical | TRUE, FALSE | is.logical() |
| complex | 1+2i | is.complex() |
| factor | factor("low") | is.factor() |
| Date | as.Date("2024-01-15") | inherits(x, "Date") |
| Function | Purpose |
|---|---|
| is.na(x) | TRUE if NA |
| is.null(x) | TRUE if NULL |
| na.rm = TRUE | Remove NA in functions |
| complete.cases(df) | Rows with no NA |
| drop_na() | tidyr: remove NA rows |
| replace_na() | tidyr: replace NA |
| coalesce(x, y) | First non-NA value |
# ── Create Data Frame ──
# Example data frame used throughout this cheatsheet: 4 people with
# name, age, test score, and city.
df <- data.frame(
name = c("Alice", "Bob", "Carol", "Dave"),
age = c(30, 25, 35, 28),
score = c(95.5, 87.0, 92.3, 78.5),
city = c("NYC", "LA", "NYC", "SF"),
stringsAsFactors = FALSE # default since R 4.0; kept for compatibility with older R
)
# ── Inspect ──
head(df, 3) # first 3 rows
tail(df, 2) # last 2 rows
str(df) # structure
summary(df) # summary statistics
dim(df) # rows x cols
nrow(df); ncol(df) # rows, columns
names(df) # column names
# ── Access Columns ──
df$name # $ notation
df["name"] # single bracket (one-column data frame)
df[["name"]] # double bracket (vector)
df[, "name"] # matrix-style indexing (vector)
df[, 1] # by position
# ── Access Rows ──
df[1, ] # first row
df[df$age > 28, ] # filtered rows
df[1:3, ] # rows 1-3
# ── Access Cells ──
df[1, "name"] # "Alice"
df[2, 3] # 87.0
# ── Add Columns ──
df$grade <- ifelse(df$score >= 90, "A",
ifelse(df$score >= 80, "B", "C"))
df$active <- TRUE
df$pct_rank <- rank(df$score) / nrow(df)
# ── Filter Rows ──
df[df$age > 28 & df$city == "NYC", ]
subset(df, age > 28 & city == "NYC")
# ── Sort ──
df[order(df$score, decreasing = TRUE), ]
df[order(df$city, df$score), ]
# ── Aggregation ──
aggregate(score ~ city, data = df, FUN = mean)
tapply(df$score, df$city, mean)
by(df$score, df$city, mean)
# ── Merge (join) ──
left <- data.frame(id = 1:3, val = c("a", "b", "c"))
right <- data.frame(id = 2:4, score = c(90, 85, 88))
merge(left, right, by = "id", all.x = TRUE) # left join
merge(left, right, by = "id", all = TRUE) # full join
# ── Apply Functions ──
sapply(df[, c("age", "score")], mean)
lapply(df, class)
apply(df[, c("age", "score")], 2, sum)

df[1, ] gets the first row. Use dplyr for cleaner data manipulation instead of base R subsetting.

library(dplyr)
library(tidyr)
# ── Pipe Operator (%>%) ──
df %>%
filter(age > 25) %>%
select(name, age, score) %>%
arrange(desc(score))
# ── filter(): rows ──
df %>% filter(age > 25)
df %>% filter(age > 25, city == "NYC")
df %>% filter(between(age, 25, 35))
df %>% filter(city %in% c("NYC", "SF"))
# ── select(): columns ──
df %>% select(name, age)
df %>% select(-city) # remove column
df %>% select(starts_with("s")) # starts with "s"
df %>% select(ends_with("e")) # ends with "e"
df %>% select(name:score) # name through score
df %>% select(where(is.numeric)) # all numeric cols
df %>% rename(full_name = name) # rename column
# ── mutate(): new columns ──
df %>% mutate(
age_next = age + 1,
score_pct = score / max(score) * 100,
age_group = case_when(
age < 30 ~ "young",
age < 40 ~ "mid",
TRUE ~ "senior"
)
)
df %>% transmute(score_pct = score / max(score) * 100)
# ── arrange(): sort ──
df %>% arrange(age) # ascending
df %>% arrange(desc(score)) # descending
df %>% arrange(city, desc(score)) # multi-column
# ── summarise() + group_by() ──
df %>%
group_by(city) %>%
summarise(
count = n(),
avg_age = mean(age),
avg_score = mean(score),
max_score = max(score),
.groups = "drop"
)
# ── Window functions ──
df %>%
group_by(city) %>%
mutate(
city_avg = mean(score),
rank_in_city = rank(desc(score)),
pct = score / sum(score) * 100
)
# ── join functions ──
df1 %>% inner_join(df2, by = "id")
df1 %>% left_join(df2, by = "id")
df1 %>% right_join(df2, by = "id")
df1 %>% full_join(df2, by = "id")
df1 %>% semi_join(df2, by = "id") # WHERE EXISTS
df1 %>% anti_join(df2, by = "id") # WHERE NOT EXISTS
# ── tidyr: reshape ──
long %>% pivot_wider(names_from = key, values_from = value)
wide %>% pivot_longer(cols = -id, names_to = "key", values_to = "value")

| Verb | Purpose | SQL Equivalent |
|---|---|---|
| filter() | Select rows | WHERE |
| select() | Select columns | SELECT |
| mutate() | Add/transform columns | computed column |
| arrange() | Sort | ORDER BY |
| summarise() | Aggregate | GROUP BY + agg |
| group_by() | Group data | GROUP BY |
| rename() | Rename columns | AS |
| distinct() | Unique rows | SELECT DISTINCT |
| Function | Behavior |
|---|---|
| inner_join() | Matching rows from both |
| left_join() | All from left, matching from right |
| right_join() | All from right, matching from left |
| full_join() | All rows from both |
| semi_join() | Rows from left that match right |
| anti_join() | Rows from left that DO NOT match |
library(ggplot2)
# ── Grammar of Graphics ──
# data + aesthetics + geometry + facets + theme
# ── Scatter Plot ──
ggplot(df, aes(x = age, y = score, color = city)) +
geom_point(size = 3, alpha = 0.7) +
geom_smooth(method = "lm", se = TRUE) +
labs(title = "Age vs Score", x = "Age", y = "Score") +
theme_minimal()
# ── Bar Chart ──
ggplot(df, aes(x = city, fill = city)) +
geom_bar() +
labs(title = "Count by City") +
theme_bw()
# ── Histogram ──
ggplot(df, aes(x = score)) +
geom_histogram(bins = 20, fill = "steelblue", color = "white") +
labs(title = "Score Distribution", x = "Score", y = "Count")
# ── Box Plot ──
ggplot(df, aes(x = city, y = score, fill = city)) +
geom_boxplot() +
labs(title = "Score by City") +
theme_classic()
# ── Line Chart ──
ggplot(df, aes(x = date, y = value, color = group)) +
geom_line(linewidth = 1) + # "size" is deprecated for lines since ggplot2 3.4
geom_point(size = 2) +
labs(title = "Trend Over Time")
# ── Facets ──
ggplot(df, aes(x = age, y = score)) +
geom_point() +
facet_wrap(~ city, ncol = 2) +
theme_minimal()
# ── Save Plot ──
ggsave("plot.png", width = 8, height = 6, dpi = 300)
ggsave("plot.pdf", width = 8, height = 6)
# ── Common Theme Customization ──
theme_minimal()
theme_bw()
theme_classic()
theme_void()
# Custom theme
+ theme(
plot.title = element_text(hjust = 0.5, size = 16),
axis.text = element_text(size = 10),
legend.position = "bottom"
)

| Geom | Purpose |
|---|---|
| geom_point() | Scatter plot |
| geom_line() | Line chart |
| geom_bar() | Bar chart (count) |
| geom_col() | Bar chart (values) |
| geom_histogram() | Histogram |
| geom_boxplot() | Box plot |
| geom_violin() | Violin plot |
| geom_density() | Density plot |
| geom_smooth() | Trend line |
| geom_text() | Text labels |
| geom_tile() | Heatmap |
| Scale | Use Case |
|---|---|
| scale_color_manual() | Custom discrete colors |
| scale_color_brewer() | ColorBrewer palettes |
| scale_color_gradient() | Continuous gradient |
| scale_fill_viridis_c() | Viridis continuous |
| theme(legend.position) | Legend: "none", "bottom" |
# ── Descriptive Statistics ──
mean(x) # arithmetic mean
median(x) # median
sd(x) # standard deviation (sample)
var(x) # variance (sample)
IQR(x) # interquartile range
quantile(x, 0.25) # 25th percentile
summary(x) # min, Q1, median, mean, Q3, max
cor(x, y) # Pearson correlation
cor.test(x, y) # correlation test with p-value
# ── T-Tests ──
# One sample: is mean different from mu?
t.test(x, mu = 0)
# Two-sample (independent)
t.test(group_a, group_b)
# Paired t-test
t.test(before, after, paired = TRUE)
# ── Chi-Square Test ──
chisq.test(table(df$group, df$outcome)) # test of independence between two categorical variables
# ── ANOVA ──
# One-way ANOVA
aov_result <- aov(score ~ group, data = df)
summary(aov_result)
# Post-hoc test
TukeyHSD(aov_result)
# ── Linear Regression ──
model <- lm(score ~ age + study_hours, data = df)
summary(model) # coefficients, R-squared, p-values
coef(model) # regression coefficients
predict(model, newdata = data.frame(age = 30, study_hours = 10))
# ── Logistic Regression ──
model_logit <- glm(passed ~ age + score, data = df, family = binomial)
summary(model_logit)
predict(model_logit, type = "response") # predicted probabilities
# ── Distributions ──
dnorm(0) # density: 0.3989
pnorm(1.96) # CDF: 0.975
qnorm(0.975) # quantile: 1.96
rnorm(100, 0, 1) # 100 random normal values
dunif(0.5, 0, 1) # density uniform
dbinom(5, 10, 0.5) # density binomial
dpois(3, lambda = 5) # density Poisson

t.test(), lm(), and anova() provide comprehensive output including test statistics, p-values, and confidence intervals. Use summary() on any model to see the full output.

# ── Function Definition ──
# Add two numbers. The last evaluated expression is returned
# implicitly; explicit return() is idiomatic only for early exits.
add <- function(a, b) {
  a + b
}
add(3, 4) # 7
# Default arguments
# Build a greeting string; `greeting` has a default so callers may omit it.
greet <- function(name, greeting = "Hello") {
  # paste() joins its arguments with a single space by default
  message_text <- paste(greeting, name)
  message_text
}
greet("Alice") # "Hello Alice"
greet("Bob", "Hi") # "Hi Bob"
# Ellipsis (...) for passing args
# Thin wrapper around plot(): the ellipsis forwards every extra
# argument (e.g. col, pch, main) unchanged to plot(). Called for its
# side effect (drawing); no useful return value.
my_plot <- function(data, ...) {
plot(data, ...)
}
# ── Apply Family ──
# apply: over matrix margins (1=rows, 2=cols)
mat <- matrix(1:12, nrow = 3, ncol = 4)
apply(mat, 1, sum) # row sums
apply(mat, 2, mean) # column means
# lapply: returns list
lapply(1:5, function(x) x^2)
# [[1]] 1 [[2]] 4 [[3]] 9 ...
# sapply: simplified (vector/matrix)
sapply(1:5, function(x) x^2)
# [1] 1 4 9 16 25
# mapply: multivariate apply
mapply(sum, 1:5, 6:10)
# [1] 7 9 11 13 15
# vapply: with type safety
vapply(1:5, function(x) x^2, numeric(1))
# tapply: apply by factor
tapply(df$score, df$city, mean)
# ── purrr map functions (tidyverse) ──
library(purrr)
map(1:5, ~ .x^2) # list
map_dbl(1:5, ~ .x^2) # numeric vector
map_chr(1:5, ~ paste0("n", .x)) # character vector
map_lgl(1:5, ~ .x > 3) # logical vector
walk(files, ~ read_csv(.x)) # side-effect only
map2(1:5, letters[1:5], paste) # two inputs
# ── Conditional: ifelse / case_when ──
ifelse(x > 0, "positive", "non-positive")
dplyr::case_when(
x < 0 ~ "negative",
x == 0 ~ "zero",
TRUE ~ "positive"
)
# ── Error Handling ──
# purrr::safely() wraps a function so failures are captured in a
# list(result, error) instead of being thrown. Passing mean directly
# avoids a needless anonymous wrapper and also forwards extra
# arguments such as na.rm = TRUE.
safe_mean <- safely(mean)
safe_mean(c(1, 2, "three")) # list(result = ..., error = ...): failures are captured, not thrown

# ── CSV ──
df <- read.csv("data.csv", header = TRUE, stringsAsFactors = FALSE)
readr::read_csv("data.csv") # faster, guesses types
write.csv(df, "output.csv", row.names = FALSE)
readr::write_csv(df, "output.csv")
# ── Excel ──
library(readxl)
df <- read_excel("data.xlsx", sheet = 1)
library(writexl)
write_xlsx(df, "output.xlsx")
# ── JSON ──
library(jsonlite)
data <- fromJSON("data.json") # parse JSON
toJSON(df, pretty = TRUE) # to JSON
# ── RDS (native R format) ──
saveRDS(df, "data.rds")
df <- readRDS("data.rds")
# ── RData (multiple objects) ──
save(df, model, file = "workspace.RData")
load("workspace.RData")
# ── Databases ──
library(DBI)
con <- dbConnect(RSQLite::SQLite(), "mydb.sqlite")
dbWriteTable(con, "users", df)
result <- dbGetQuery(con, "SELECT * FROM users")
dbDisconnect(con)
# ── Parquet (fast columnar) ──
library(arrow)
df <- read_parquet("data.parquet")
write_parquet(df, "output.parquet")