R Cheatsheet
Data manipulation with dplyr, visualization with ggplot2, tidyverse, statistical tests, and data frames.
# ── Vectors ──
x <- c(1, 2, 3, 4, 5) # numeric vector
y <- c("a", "b", "c") # character vector
z <- c(TRUE, FALSE, TRUE) # logical vector
n <- 1:10 # integer sequence 1 to 10
s <- seq(0, 1, by = 0.1) # sequence with step
r <- seq_len(5) # 1 2 3 4 5
rep(0, 10) # repeat 0 ten times
rep(1:3, each = 2) # 1 1 2 2 3 3
rep(1:3, times = 3) # 1 2 3 1 2 3 1 2 3
# Vector operations
x + 1 # 2 3 4 5 6 (element-wise)
x * 2 # 2 4 6 8 10
x^2 # 1 4 9 16 25
sum(x) # 15
mean(x) # 3
sd(x) # standard deviation
var(x) # variance
min(x); max(x) # 1, 5
length(x) # 5
sort(x) # 1 2 3 4 5
rev(x) # 5 4 3 2 1
unique(x) # remove duplicates
cumsum(x) # 1 3 6 10 15
cumprod(x) # 1 2 6 24 120
# Indexing (1-based!)
x[1] # first element (1)
x[3] # third element (3)
x[c(1, 3, 5)] # elements at positions 1, 3, 5
x[-2] # all except second
x[x > 3] # elements > 3: 4 5
which(x > 3) # indices: 4 5
# Naming
names(x) <- c("a", "b", "c", "d", "e")
x["a"] # 1
# ── Basic Types ──
class(42) # "numeric"
class("hello") # "character"
class(TRUE) # "logical"
is.numeric(42) # TRUE
is.character("hi") # TRUE
as.numeric("42") # 42
as.character(42) # "42"
as.logical(1) # TRUE
NA # missing value
NULL # null object
Inf; -Inf # infinity
NaN # not a number

| Type | Example | Check |
|---|---|---|
| numeric | 3.14, 42 | is.numeric() |
| integer | 1L, 42L | is.integer() |
| character | "hello" | is.character() |
| logical | TRUE, FALSE | is.logical() |
| complex | 1+2i | is.complex() |
| factor | factor("low") | is.factor() |
| Date | as.Date("2024-01-15") | inherits(x, "Date") |
| Function | Purpose |
|---|---|
| is.na(x) | TRUE if NA |
| is.null(x) | TRUE if NULL |
| na.rm = TRUE | Remove NA in functions |
| complete.cases(df) | Rows with no NA |
| drop_na() | tidyr: remove NA rows |
| replace_na() | tidyr: replace NA |
| coalesce(x, y) | First non-NA value |
# ── Create Data Frame ──
# Example data frame used throughout this cheatsheet: 4 people with
# name, age, test score, and city.
df <- data.frame(
name = c("Alice", "Bob", "Carol", "Dave"),
age = c(30, 25, 35, 28),
score = c(95.5, 87.0, 92.3, 78.5),
city = c("NYC", "LA", "NYC", "SF"),
stringsAsFactors = FALSE # default since R 4.0; kept for compatibility with older R
)
# ── Inspect ──
head(df, 3) # first 3 rows
tail(df, 2) # last 2 rows
str(df) # structure
summary(df) # summary statistics
dim(df) # rows x cols
nrow(df); ncol(df) # rows, columns
names(df) # column names
# ── Access Columns ──
df$name # $ notation
df["name"] # single bracket (one-column data frame)
df[["name"]] # double bracket (vector)
df[, "name"] # matrix-style indexing (vector)
df[, 1] # by position
# ── Access Rows ──
df[1, ] # first row
df[df$age > 28, ] # filtered rows
df[1:3, ] # rows 1-3
# ── Access Cells ──
df[1, "name"] # "Alice"
df[2, 3] # 87.0
# ── Add Columns ──
df$grade <- ifelse(df$score >= 90, "A",
ifelse(df$score >= 80, "B", "C"))
df$active <- TRUE
df$pct_rank <- rank(df$score) / nrow(df)
# ── Filter Rows ──
df[df$age > 28 & df$city == "NYC", ]
subset(df, age > 28 & city == "NYC")
# ── Sort ──
df[order(df$score, decreasing = TRUE), ]
df[order(df$city, df$score), ]
# ── Aggregation ──
aggregate(score ~ city, data = df, FUN = mean)
tapply(df$score, df$city, mean)
by(df$score, df$city, mean)
# ── Merge (join) ──
left <- data.frame(id = 1:3, val = c("a", "b", "c"))
right <- data.frame(id = 2:4, score = c(90, 85, 88))
merge(left, right, by = "id", all.x = TRUE) # left join
merge(left, right, by = "id", all = TRUE) # full join
# ── Apply Functions ──
sapply(df[, c("age", "score")], mean)
lapply(df, class)
apply(df[, c("age", "score")], 2, sum)

df[1, ] gets the first row. Use dplyr for cleaner data manipulation instead of base R subsetting.

library(dplyr)
library(tidyr)
# ── Pipe Operator (%>%) ──
df %>%
filter(age > 25) %>%
select(name, age, score) %>%
arrange(desc(score))
# ── filter(): rows ──
df %>% filter(age > 25)
df %>% filter(age > 25, city == "NYC")
df %>% filter(between(age, 25, 35))
df %>% filter(city %in% c("NYC", "SF"))
# ── select(): columns ──
df %>% select(name, age)
df %>% select(-city) # remove column
df %>% select(starts_with("s")) # starts with "s"
df %>% select(ends_with("e")) # ends with "e"
df %>% select(name:score) # name through score
df %>% select(where(is.numeric)) # all numeric cols
df %>% rename(full_name = name) # rename column
# ── mutate(): new columns ──
df %>% mutate(
age_next = age + 1,
score_pct = score / max(score) * 100,
age_group = case_when(
age < 30 ~ "young",
age < 40 ~ "mid",
TRUE ~ "senior"
)
)
df %>% transmute(score_pct = score / max(score) * 100)
# ── arrange(): sort ──
df %>% arrange(age) # ascending
df %>% arrange(desc(score)) # descending
df %>% arrange(city, desc(score)) # multi-column
# ── summarise() + group_by() ──
df %>%
group_by(city) %>%
summarise(
count = n(),
avg_age = mean(age),
avg_score = mean(score),
max_score = max(score),
.groups = "drop"
)
# ── Window functions ──
df %>%
group_by(city) %>%
mutate(
city_avg = mean(score),
rank_in_city = rank(desc(score)),
pct = score / sum(score) * 100
)
# ── join functions ──
df1 %>% inner_join(df2, by = "id")
df1 %>% left_join(df2, by = "id")
df1 %>% right_join(df2, by = "id")
df1 %>% full_join(df2, by = "id")
df1 %>% semi_join(df2, by = "id") # WHERE EXISTS
df1 %>% anti_join(df2, by = "id") # WHERE NOT EXISTS
# ── tidyr: reshape ──
long %>% pivot_wider(names_from = key, values_from = value)
wide %>% pivot_longer(cols = -id, names_to = "key", values_to = "value")

| Verb | Purpose | SQL Equivalent |
|---|---|---|
| filter() | Select rows | WHERE |
| select() | Select columns | SELECT |
| mutate() | Add/transform columns | computed column |
| arrange() | Sort | ORDER BY |
| summarise() | Aggregate | GROUP BY + agg |
| group_by() | Group data | GROUP BY |
| rename() | Rename columns | AS |
| distinct() | Unique rows | SELECT DISTINCT |
| Function | Behavior |
|---|---|
| inner_join() | Matching rows from both |
| left_join() | All from left, matching from right |
| right_join() | All from right, matching from left |
| full_join() | All rows from both |
| semi_join() | Rows from left that match right |
| anti_join() | Rows from left that DO NOT match |
library(ggplot2)
# ── Grammar of Graphics ──
# data + aesthetics + geometry + facets + theme
# ── Scatter Plot ──
ggplot(df, aes(x = age, y = score, color = city)) +
geom_point(size = 3, alpha = 0.7) +
geom_smooth(method = "lm", se = TRUE) +
labs(title = "Age vs Score", x = "Age", y = "Score") +
theme_minimal()
# ── Bar Chart ──
ggplot(df, aes(x = city, fill = city)) +
geom_bar() +
labs(title = "Count by City") +
theme_bw()
# ── Histogram ──
ggplot(df, aes(x = score)) +
geom_histogram(bins = 20, fill = "steelblue", color = "white") +
labs(title = "Score Distribution", x = "Score", y = "Count")
# ── Box Plot ──
ggplot(df, aes(x = city, y = score, fill = city)) +
geom_boxplot() +
labs(title = "Score by City") +
theme_classic()
# ── Line Chart ──
ggplot(df, aes(x = date, y = value, color = group)) +
geom_line(linewidth = 1) + # "size" is deprecated for lines since ggplot2 3.4
geom_point(size = 2) +
labs(title = "Trend Over Time")
# ── Facets ──
ggplot(df, aes(x = age, y = score)) +
geom_point() +
facet_wrap(~ city, ncol = 2) +
theme_minimal()
# ── Save Plot ──
ggsave("plot.png", width = 8, height = 6, dpi = 300)
ggsave("plot.pdf", width = 8, height = 6)
# ── Common Theme Customization ──
theme_minimal()
theme_bw()
theme_classic()
theme_void()
# Custom theme
+ theme(
plot.title = element_text(hjust = 0.5, size = 16),
axis.text = element_text(size = 10),
legend.position = "bottom"
)

| Geom | Purpose |
|---|---|
| geom_point() | Scatter plot |
| geom_line() | Line chart |
| geom_bar() | Bar chart (count) |
| geom_col() | Bar chart (values) |
| geom_histogram() | Histogram |
| geom_boxplot() | Box plot |
| geom_violin() | Violin plot |
| geom_density() | Density plot |
| geom_smooth() | Trend line |
| geom_text() | Text labels |
| geom_tile() | Heatmap |
| Scale | Use Case |
|---|---|
| scale_color_manual() | Custom discrete colors |
| scale_color_brewer() | ColorBrewer palettes |
| scale_color_gradient() | Continuous gradient |
| scale_fill_viridis_c() | Viridis continuous |
| theme(legend.position) | Legend: "none", "bottom" |
# ── Descriptive Statistics ──
mean(x) # arithmetic mean
median(x) # median
sd(x) # standard deviation (sample)
var(x) # variance (sample)
IQR(x) # interquartile range
quantile(x, 0.25) # 25th percentile
summary(x) # min, Q1, median, mean, Q3, max
cor(x, y) # Pearson correlation
cor.test(x, y) # correlation test with p-value
# ── T-Tests ──
# One sample: is mean different from mu?
t.test(x, mu = 0)
# Two-sample (independent)
t.test(group_a, group_b)
# Paired t-test
t.test(before, after, paired = TRUE)
# ── Chi-Square Test ──
chisq.test(table(df$group, df$outcome)) # test of independence between two categorical variables
# ── ANOVA ──
# One-way ANOVA
aov_result <- aov(score ~ group, data = df)
summary(aov_result)
# Post-hoc test
TukeyHSD(aov_result)
# ── Linear Regression ──
model <- lm(score ~ age + study_hours, data = df)
summary(model) # coefficients, R-squared, p-values
coef(model) # regression coefficients
predict(model, newdata = data.frame(age = 30, study_hours = 10))
# ── Logistic Regression ──
model_logit <- glm(passed ~ age + score, data = df, family = binomial)
summary(model_logit)
predict(model_logit, type = "response") # predicted probabilities
# ── Distributions ──
dnorm(0) # density: 0.3989
pnorm(1.96) # CDF: 0.975
qnorm(0.975) # quantile: 1.96
rnorm(100, 0, 1) # 100 random normal values
dunif(0.5, 0, 1) # density uniform
dbinom(5, 10, 0.5) # density binomial
dpois(3, lambda = 5) # density Poisson

t.test(), lm(), and anova() provide comprehensive output including test statistics, p-values, and confidence intervals. Use summary() on any model to see the full output.

# ── Function Definition ──
# Add two numbers. The last evaluated expression is returned
# implicitly; explicit return() is idiomatic only for early exits.
add <- function(a, b) {
  a + b
}
add(3, 4) # 7
# Default arguments
# Build a greeting string; `greeting` has a default so callers may omit it.
greet <- function(name, greeting = "Hello") {
  # paste() joins its arguments with a single space by default
  message_text <- paste(greeting, name)
  message_text
}
greet("Alice") # "Hello Alice"
greet("Bob", "Hi") # "Hi Bob"
# Ellipsis (...) for passing args
# Thin wrapper around plot(): the ellipsis forwards every extra
# argument (e.g. col, pch, main) unchanged to plot(). Called for its
# side effect (drawing); no useful return value.
my_plot <- function(data, ...) {
plot(data, ...)
}
# ── Apply Family ──
# apply: over matrix margins (1=rows, 2=cols)
mat <- matrix(1:12, nrow = 3, ncol = 4)
apply(mat, 1, sum) # row sums
apply(mat, 2, mean) # column means
# lapply: returns list
lapply(1:5, function(x) x^2)
# [[1]] 1 [[2]] 4 [[3]] 9 ...
# sapply: simplified (vector/matrix)
sapply(1:5, function(x) x^2)
# [1] 1 4 9 16 25
# mapply: multivariate apply
mapply(sum, 1:5, 6:10)
# [1] 7 9 11 13 15
# vapply: with type safety
vapply(1:5, function(x) x^2, numeric(1))
# tapply: apply by factor
tapply(df$score, df$city, mean)
# ── purrr map functions (tidyverse) ──
library(purrr)
map(1:5, ~ .x^2) # list
map_dbl(1:5, ~ .x^2) # numeric vector
map_chr(1:5, ~ paste0("n", .x)) # character vector
map_lgl(1:5, ~ .x > 3) # logical vector
walk(files, ~ read_csv(.x)) # side-effect only
map2(1:5, letters[1:5], paste) # two inputs
# ── Conditional: ifelse / case_when ──
ifelse(x > 0, "positive", "non-positive")
dplyr::case_when(
x < 0 ~ "negative",
x == 0 ~ "zero",
TRUE ~ "positive"
)
# ── Error Handling ──
# purrr::safely() wraps a function so failures are captured in a
# list(result, error) instead of being thrown. Passing mean directly
# avoids a needless anonymous wrapper and also forwards extra
# arguments such as na.rm = TRUE.
safe_mean <- safely(mean)
safe_mean(c(1, 2, "three")) # list(result = ..., error = ...): failures are captured, not thrown

# ── CSV ──
df <- read.csv("data.csv", header = TRUE, stringsAsFactors = FALSE)
readr::read_csv("data.csv") # faster, guesses types
write.csv(df, "output.csv", row.names = FALSE)
readr::write_csv(df, "output.csv")
# ── Excel ──
library(readxl)
df <- read_excel("data.xlsx", sheet = 1)
library(writexl)
write_xlsx(df, "output.xlsx")
# ── JSON ──
library(jsonlite)
data <- fromJSON("data.json") # parse JSON
toJSON(df, pretty = TRUE) # to JSON
# ── RDS (native R format) ──
saveRDS(df, "data.rds")
df <- readRDS("data.rds")
# ── RData (multiple objects) ──
save(df, model, file = "workspace.RData")
load("workspace.RData")
# ── Databases ──
library(DBI)
con <- dbConnect(RSQLite::SQLite(), "mydb.sqlite")
dbWriteTable(con, "users", df)
result <- dbGetQuery(con, "SELECT * FROM users")
dbDisconnect(con)
# ── Parquet (fast columnar) ──
library(arrow)
df <- read_parquet("data.parquet")
write_parquet(df, "output.parquet")