# Pandas Cheatsheet
# DataFrames, indexing, groupby, joins, cleaning and time-series operations for data analysis.
import pandas as pd
import numpy as np
# ── Series ──
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'], name='values')
s = pd.Series({'x': 1, 'y': 2, 'z': 3})
s = pd.Series(np.arange(5), dtype='float32')
# ── DataFrame from dict ──
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie'],
'age': [25, 30, 35],
'city': ['NYC', 'LA', 'Chicago'],
})
# ── DataFrame from list of dicts ──
df = pd.DataFrame([
{'name': 'Alice', 'age': 25},
{'name': 'Bob', 'age': 30},
])
# ── DataFrame from NumPy array ──
arr = np.random.randn(4, 3)
df = pd.DataFrame(arr, columns=['A', 'B', 'C'], index=['w', 'x', 'y', 'z'])
# ── From list of tuples ──
data = [('Alice', 25, 'NYC'), ('Bob', 30, 'LA')]
df = pd.DataFrame(data, columns=['name', 'age', 'city'])
# ── Read from files ──
df = pd.read_csv('data.csv')
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
df = pd.read_json('data.json')
df = pd.read_sql('SELECT * FROM table', con)
df = pd.read_parquet('data.parquet')
df = pd.read_clipboard() # from clipboard
df = pd.read_html(url)[0] # from HTML tables
# ── Ranges & sequences ──
dates = pd.date_range('2024-01-01', periods=6, freq='D')
df = pd.DataFrame({'date': dates, 'value': np.arange(6)})
# ── MultiIndex DataFrame ──
index = pd.MultiIndex.from_tuples(
[('US', 'CA'), ('US', 'NY'), ('UK', 'LN'), ('UK', 'MN')],
names=['country', 'city']
)
df = pd.DataFrame({'population': [39, 19, 9, 3]}, index=index)
# ── Categorical ──
df = pd.DataFrame({
'grade': pd.Categorical(['A', 'B', 'A', 'C', 'B'],
categories=['A', 'B', 'C', 'D'],
ordered=True),
})
# ── Timedelta ──
df = pd.DataFrame({
'task': ['build', 'test', 'deploy'],
'duration': pd.to_timedelta(['2h 30m', '1h 15m', '45m']),
})
# ── Quick exploratory DataFrames ──
df_test = pd.DataFrame(np.random.rand(100, 5)) # 100×5 random floats
df_zero = pd.DataFrame(np.zeros((3, 4)), columns=list('ABCD'))
# ── Convert between structures ──
s.to_frame() # Series → DataFrame
df.to_dict('records') # → list of dicts
df.to_dict('list') # → dict of lists
df.to_numpy() # → np.ndarray
df.values # → np.ndarray (same)
df.to_dict('index') # → {index: {col: val}}
df.to_dict('split') # → {'columns':.., 'data':.., 'index':..}

| Function | Format | Key Param |
|---|---|---|
| read_csv() | CSV, TSV | sep, header, names, dtype |
| read_excel() | XLSX, XLS | sheet_name, engine |
| read_json() | JSON | orient, lines |
| read_parquet() | Parquet | columns, filters |
| read_sql() | SQL query | con, params, chunksize |
| read_html() | HTML table | match, flavor |
| read_clipboard() | Clipboard | sep |
| read_fwf() | Fixed-width | colspecs, widths |
# Tip: Specify dtype explicitly in read_csv() to avoid memory-heavy type
# inference. Use pd.read_csv(..., dtype=dict(id='int32', price='float32')).
# ── Detect missing values ──
df.isna() # boolean mask of NaN/None
df.isnull() # alias for isna()
df.notna() # inverse
df.isna().sum() # count NaN per column
df.isna().sum().sum() # total NaN count
df[df.isna().any(axis=1)] # rows with any NaN
# ── Drop missing ──
df.dropna() # drop rows with ANY NaN
df.dropna(how='all') # drop rows where ALL values NaN
df.dropna(subset=['col1', 'col2']) # only check these columns
df.dropna(thresh=3) # keep rows with ≥3 non-NaN values
df.dropna(axis=1) # drop columns with NaN
# ── Fill missing ──
df.fillna(0) # fill all NaN with 0
df.fillna({'col1': 0, 'col2': 'unknown'})
df['col'].fillna(df['col'].mean()) # fill with mean
df['col'].fillna(df['col'].median()) # fill with median
df['col'].fillna(df['col'].mode()[0]) # fill with mode
df.fillna(method='ffill') # forward fill (deprecated → ffill())
df.ffill() # forward fill
df.bfill() # backward fill
df.interpolate(method='linear') # linear interpolation
# ── Deduplication ──
df.duplicated() # boolean mask of duplicates
df.duplicated(subset=['col1']) # check only specific columns
df.drop_duplicates() # remove duplicate rows
df.drop_duplicates(subset=['col1'], keep='last')
df.drop_duplicates(keep=False) # drop ALL duplicates
# ── Rename columns ──
df.rename(columns={'old': 'new'})
df.rename(columns=str.lower) # lowercase all columns
df.columns = ['a', 'b', 'c'] # assign new column names
df.columns = df.columns.str.strip() # strip whitespace
# ── Replace values ──
df.replace('N/A', np.nan)
df.replace({'col1': {0: np.nan, -1: np.nan}})
df.replace([r'^\s+$'], np.nan, regex=True) # blank strings → NaN
# ── String cleaning ──
df['col'] = df['col'].str.strip() # trim whitespace
df['col'] = df['col'].str.lower() # lowercase
df['col'] = df['col'].str.upper() # uppercase
df['col'] = df['col'].str.title() # title case
df['col'] = df['col'].str.replace(',', '') # remove commas
df['col'] = df['col'].str.replace(r'\D', '', regex=True) # keep digits only
df['col'] = df['col'].str.split(',').str[0] # split and take first
df['col'] = df['col'].str.extract(r'(\d+)') # regex extract
# ── Type conversion ──
df['col'] = df['col'].astype(int)
df['col'] = df['col'].astype('float32')
df['col'] = pd.to_numeric(df['col'], errors='coerce') # invalid → NaN
df['col'] = pd.to_datetime(df['col'])
df['col'] = df['col'].astype('category')
# ── Outlier detection with IQR ──
Q1 = df['col'].quantile(0.25)
Q3 = df['col'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
outliers = df[(df['col'] < lower) | (df['col'] > upper)]
# ── Clip outliers ──
df['col'] = df['col'].clip(lower, upper)
# ── Z-score method ──
from scipy import stats
z_scores = np.abs(stats.zscore(df['col']))
outliers = df[z_scores > 3]
# ── Apply custom cleaning function ──
def clean_value(val):
if pd.isna(val):
return 0
if isinstance(val, str):
return float(val.replace('$', '').replace(',', ''))
return val
df['col'] = df['col'].apply(clean_value)

| Strategy | Method | Best For |
|---|---|---|
| Drop rows | dropna() | Few missing rows (<5%) |
| Fill constant | fillna(val) | Known default values |
| Fill mean/med | fillna(df.mean()) | Numeric, normal dist |
| Forward fill | ffill() | Time series data |
| Interpolate | interpolate() | Sequential numeric data |
| Drop columns | dropna(axis=1) | Columns >50% missing |
| Coerce | to_numeric(coerce) | Mixed-type columns |
| Custom fill | fillna(func) | Domain-specific logic |
# Caution: fillna() before groupby() operations can introduce bias. Always
# understand why data is missing (MCAR, MAR, MNAR) before choosing an
# imputation strategy.
# ── Column selection ──
df['col'] # single column → Series
df[['col1', 'col2']] # multiple columns → DataFrame
df.loc[:, 'A':'C'] # columns A through C (inclusive)
df.iloc[:, [0, 2, 4]] # columns by integer position
df.filter(regex='^price_') # regex match on column names
df.filter(like='total') # substring match on column names
df.select_dtypes(include='number') # only numeric columns
df.select_dtypes(include=['int64', 'float64'])
df.select_dtypes(exclude='object')
# ── Row selection by label ──
df.loc[0] # single row by label
df.loc[2:5] # rows 2–5 inclusive
df.loc[df.index[0]] # first row
df.loc[['row1', 'row3', 'row5']]
# ── Row selection by position ──
df.iloc[0] # first row
df.iloc[-1] # last row
df.iloc[2:5] # rows 2, 3, 4 (exclusive end)
df.iloc[[0, 3, 7]] # specific rows
df.iloc[::2] # every other row
# ── Combined selection ──
df.loc[2:5, ['name', 'age']] # rows 2-5, specific columns
df.iloc[0:3, 0:2] # first 3 rows, first 2 cols
df.at[0, 'col'] # scalar value (fast)
df.iat[0, 2] # scalar by position (fast)
# ── Boolean indexing ──
df[df['age'] > 25] # single condition
df[(df['age'] > 25) & (df['city'] == 'NYC')] # AND
df[(df['age'] > 25) | (df['city'] == 'NYC')] # OR
df[~(df['age'] > 25)] # NOT
df[df['city'].isin(['NYC', 'LA', 'Chicago'])] # membership
df[df['name'].str.contains('Al')] # string contains
df[df['name'].str.startswith('A')]
df[df['col'].between(10, 50)] # range
df[df['col'].isna()] # is NaN
df[df['col'].notna()] # not NaN
# ── query() — SQL-like syntax ──
df.query('age > 25 and city == "NYC"')
df.query('age > @threshold') # reference variable
df.query('A > B') # compare columns
df.query('not (A > 5 or B < 2)')
# ── nlargest / nsmallest ──
df.nlargest(5, 'salary') # top 5 by column
df.nsmallest(3, 'price', keep='last')
# ── Sample ──
df.sample(n=5) # random 5 rows
df.sample(frac=0.1) # random 10%
df.sample(n=3, random_state=42) # reproducible
# ── Where / mask ──
df.where(df['age'] > 25) # keep non-matching as NaN
df.mask(df['age'] < 18) # NaN where condition is True
df.clip(lower=0, upper=100) # clip values to range

| Aspect | loc[] | iloc[] |
|---|---|---|
| Indexing by | Label / name | Integer position |
| End slice | Inclusive | Exclusive |
| Accepts | Labels, boolean arrays | Integers, boolean arrays |
| Speed | Slightly slower | Faster (positional) |
| Use case | Named index, conditions | Positional access |
# Tip: Use .query() for complex boolean filters — it's more readable and often
# faster than chained boolean operations, especially with large DataFrames.
# ── Basic aggregation ──
df.sum()
df.mean()
df.median()
df.min()
df.max()
df.count() # non-NaN count
df.std() # standard deviation
df.var() # variance
df.sem() # standard error of the mean
df.quantile([0.25, 0.5, 0.75]) # quartiles
df.describe() # full summary
df.agg(['sum', 'mean', 'std', 'count'])
# ── Column-specific aggregation ──
df.agg({
'salary': ['mean', 'median', 'min', 'max'],
'age': ['mean', 'std'],
'name': 'count',
})
# ── Cumulative operations ──
df['col'].cumsum()
df['col'].cummax()
df['col'].cummin()
df['col'].cumprod()
df['col'].pct_change() # percent change
df['col'].rank() # rank values
# ── GroupBy basics ──
grouped = df.groupby('department')
grouped = df.groupby(['dept', 'team']) # multi-level
grouped = df.groupby(df.index // 5) # bin index
# ── Aggregation ──
grouped.size() # group sizes
grouped.count() # non-NaN per column
grouped.sum()
grouped.mean()
grouped.median()
grouped.std()
grouped.agg(['sum', 'mean', 'count'])
grouped.agg({'salary': ['mean', 'max'], 'age': 'median'})
# ── Named aggregation (clean column names) ──
df.groupby('dept').agg(
avg_salary=('salary', 'mean'),
max_salary=('salary', 'max'),
headcount=('name', 'count'),
)
# ── Transform (same shape as original) ──
df['salary_zscore'] = df.groupby('dept')['salary'].transform(
lambda x: (x - x.mean()) / x.std()
)
df['rank_in_dept'] = df.groupby('dept')['salary'].rank('dense')
# ── Filter groups ──
df.groupby('dept').filter(lambda x: len(x) >= 5) # keep groups with ≥5 rows
# ── Apply custom functions ──
def custom(group):
return pd.Series({
'range': group['salary'].max() - group['salary'].min(),
'iqr': group['salary'].quantile(0.75) - group['salary'].quantile(0.25),
})
df.groupby('dept').apply(custom)
# ── Pivot table ──
df.pivot_table(
values='salary',
index='department',
columns='experience_level',
aggfunc='mean',
fill_value=0,
margins=True, # add row/col totals
margins_name='Total',
observed=True, # only observed categories
)
# ── Multiple aggfuncs in pivot ──
df.pivot_table(
values='salary',
index='dept',
columns='level',
aggfunc=['mean', 'count', 'std'],
)
# ── Melt (unpivot) ──
pd.melt(
df,
id_vars=['name', 'dept'], # columns to keep
value_vars=['Q1', 'Q2', 'Q3'], # columns to unpivot
var_name='quarter',
value_name='revenue',
)
# ── Crosstab ──
pd.crosstab(df['dept'], df['level'], margins=True)
pd.crosstab(df['dept'], df['level'], values=df['salary'], aggfunc='mean')
# ── Stack / Unstack ──
df.set_index(['dept', 'team'])['salary'].unstack(fill_value=0)
df.stack() # pivot columns → rows
df.unstack() # pivot rows → columns

| Method | Returns | Use Case |
|---|---|---|
| aggregate() | Reduced DF | One value per group |
| transform() | Same shape | Normalize, fill per group |
| apply() | Flexible | Custom group operations |
| filter() | Filtered DF | Drop groups by condition |
| size() | Series | Count per group |
| get_group() | DataFrame | Extract one group |
| Function | Direction | When |
|---|---|---|
| pivot() | long → wide | Unique index+columns |
| pivot_table() | long → wide | Needs aggregation |
| melt() | wide → long | Unpivot columns |
| stack() | cols → rows | Innermost level |
| unstack() | rows → cols | Innermost index |
| crosstab() | 2 cols → table | Frequency / agg table |
# Tip: Prefer named aggregation with .agg(avg_x=('x', 'mean'), ...) instead of
# chained rename operations. It produces clean, readable column names in one step.
# ── Merge (SQL-style join) ──
df_merged = pd.merge(
left=df1,
right=df2,
on='key', # join column (both sides)
how='inner', # inner, outer, left, right
suffixes=('_left', '_right'),
indicator=True, # adds _merge column
validate='one_to_one', # 1:1, 1:m, m:1, m:m
)
# ── Different key names ──
pd.merge(df1, df2, left_on='id', right_on='user_id')
# ── Multiple keys ──
pd.merge(df1, df2, on=['dept', 'team'])
# ── Join types ──
pd.merge(df1, df2, how='inner') # matching rows only
pd.merge(df1, df2, how='outer') # all rows (NaN for missing)
pd.merge(df1, df2, how='left') # all from left
pd.merge(df1, df2, how='right') # all from right
pd.merge(df1, df2, how='cross') # cartesian product
# ── Index merge ──
df1.join(df2, on='key', how='left', lsuffix='_l', rsuffix='_r')
df1.join(df2, how='outer') # both indices
# ── Merge on index ──
pd.merge(df1, df2, left_index=True, right_index=True)
pd.merge(df1, df2, left_on='key', right_index=True)
# ── Concatenate (stack vertically / horizontally) ──
pd.concat([df1, df2], axis=0) # stack rows (vertical)
pd.concat([df1, df2], axis=1) # stack columns (horizontal)
pd.concat([df1, df2], ignore_index=True) # reset index
# ── Concat with keys ──
pd.concat([df1, df2, df3], keys=['Q1', 'Q2', 'Q3'])
# creates MultiIndex: (Q1, 0), (Q1, 1), (Q2, 0), ...
# ── Concat with inner join on columns ──
pd.concat([df1, df2], axis=0, join='inner') # only common columns
pd.concat([df1, df2], axis=0, join='outer') # all columns (default)
# ── Combine first (fill missing with another) ──
df1.combine_first(df2) # prefer df1, fill NaN from df2
# ── Update (in-place fill) ──
df1.update(df2) # overwrite df1 in place with non-NaN values from df2
# ── Append rows (convenience, concat under the hood) ──
df.loc[len(df)] = {'name': 'Alice', 'age': 25} # single row
df = pd.concat([df, new_row_df], ignore_index=True)
# ── Compare DataFrames ──
df1.compare(df2) # show differences

| Type | Left Rows | Right Rows | SQL Equivalent |
|---|---|---|---|
| inner | Matched | Matched | INNER JOIN |
| left | All | Matched | LEFT JOIN |
| right | Matched | All | RIGHT JOIN |
| outer | All | All | FULL OUTER JOIN |
| cross | All × All | All × All | CROSS JOIN |
| Method | Best For | Key Param |
|---|---|---|
| merge() | SQL-style join | on, how |
| join() | Index-based join | on, how |
| concat() | Stack/combine | axis, keys |
| combine_first() | Fill missing | N/A |
| append() | Add rows (deprecated) | N/A |
# Tip: After merge(), check for duplicate columns with suffixes. Use
# validate='one_to_many' to catch unexpected duplicates early.
# ── Create datetime objects ──
pd.to_datetime('2024-01-15')
pd.to_datetime(['2024-01-01', '2024-02-01', '2024-03-01'])
pd.to_datetime(df['date_str'], format='%Y-%m-%d')
pd.to_datetime(df['timestamp'], unit='s') # unix epoch
# ── Date ranges ──
pd.date_range('2024-01-01', periods=10, freq='D')
pd.date_range('2024-01-01', '2024-12-31', freq='MS') # month start
pd.date_range('2024-01-01', periods=12, freq='ME') # month end
pd.bdate_range('2024-01-01', periods=10) # business days
# ── Frequency aliases ──
# D=daily, W=weekly, ME=month end, MS=month start
# YE=year end, YS=year start, h=hourly, min=minutely
# s=secondly, B=business day, BH=business hour
# 2D=every 2 days, 2h=every 2 hours, 15min=every 15 min
# ── Set datetime index ──
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df = df.sort_index()
# ── Access datetime properties ──
df.index.year
df.index.month
df.index.day
df.index.day_name() # 'Monday', 'Tuesday', ...
df.index.dayofweek # 0=Mon, 6=Sun
df.index.quarter
df.index.is_month_start
df.index.is_month_end
df.index.is_quarter_end
# ── Resampling (change frequency) ──
# Downsample (reduce frequency)
df.resample('W').sum() # weekly sum
df.resample('ME').mean() # monthly mean
df.resample('QE').max() # quarterly max
df.resample('YE').std() # yearly std
# Resample with custom agg
df.resample('ME').agg({
'price': ['mean', 'max', 'min'],
'volume': 'sum',
})
# Upsample (increase frequency)
df.resample('D').asfreq() # fill with NaN
df.resample('D').ffill() # forward fill
df.resample('D').interpolate() # interpolate
# ── Rolling windows ──
df['price'].rolling(window=7).mean() # 7-period MA
df['price'].rolling(window=30).std() # 30-period std
df['price'].rolling(window=7, center=True).mean()
df['price'].rolling(window=7, min_periods=3).mean() # need ≥3 points
# ── Exponentially weighted ──
df['price'].ewm(span=7, adjust=False).mean() # EMA
# ── Shifting ──
df['price'].shift(1) # shift down by 1 (lag)
df['price'].shift(-1) # shift up by 1 (lead)
df['pct_change'] = df['price'].pct_change()
df['diff'] = df['price'].diff()
# ── Timezone handling ──
df.index = df.index.tz_localize('UTC')
df.index = df.index.tz_convert('America/New_York')
df.index.tz # show timezone
df.index.tz_convert(None) # remove timezone
# ── Date arithmetic ──
pd.Timestamp('2024-01-01') + pd.Timedelta(days=30)
pd.Timestamp('2024-01-01') - pd.Timestamp('2023-06-15')
df['date'] + pd.DateOffset(months=3)
# ── Between time ──
df.between_time('09:30', '16:00') # keep hours between
df.at_time('09:30') # exact time

| Alias | Description | Example |
|---|---|---|
| D | Calendar day | freq="D" |
| B | Business day | freq="B" |
| W-MON | Week ending Monday | freq="W-MON" |
| MS | Month start | freq="MS" |
| ME | Month end | freq="ME" |
| QE | Quarter end | freq="QE" |
| YE | Year end | freq="YE" |
| h | Hourly | freq="h" |
| 15min | 15 minutes | freq="15min" |
| Function | Window Type | Use Case |
|---|---|---|
| rolling(w) | Fixed window | Moving average, Bollinger |
| expanding() | Expanding window | Cumulative mean |
| ewm(span) | Exponential decay | Smoothed trends |
| shift(n) | Time shift | Lag/lead features |
| diff() | Difference | Stationarity, returns |
| pct_change() | Percent change | Growth rates |
# Tip: Use df.set_index('date').sort_index() to enable .resample(), .rolling(),
# and .asfreq().
import matplotlib.pyplot as plt
# ── Line plot ──
df.plot(kind='line', x='date', y='price', figsize=(12, 5))
df['price'].plot(title='Price Over Time', grid=True, color='blue')
# ── Bar plot ──
df['category'].value_counts().plot(kind='bar')
df.plot(kind='bar', x='name', y=['salary', 'bonus'], stacked=True)
df.plot(kind='barh', x='name', y='salary') # horizontal
# ── Histogram ──
df['age'].plot(kind='hist', bins=20, edgecolor='white', alpha=0.8)
df['age'].plot(kind='hist', bins=20, density=True) # normalized
# ── Scatter ──
df.plot(kind='scatter', x='height', y='weight', s=df['age'], alpha=0.5)
# ── Box plot ──
df.plot(kind='box', column=['salary', 'bonus'], by='department')
# ── Area plot ──
df[['sales_A', 'sales_B', 'sales_C']].plot(kind='area', stacked=True)
# ── Pie chart ──
df['category'].value_counts().plot(kind='pie', autopct='%1.1f%%')
# ── Subplots ──
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
df['col1'].plot(ax=axes[0, 0], kind='hist')
df['col2'].plot(ax=axes[0, 1], kind='box')
df['col3'].plot(ax=axes[1, 0], kind='line')
df.plot(ax=axes[1, 1], kind='scatter', x='col1', y='col2')
# ── Styling ──
plt.style.use('seaborn-v0_8-darkgrid')
df.plot(title='Title', xlabel='X Label', ylabel='Y Label',
figsize=(12, 6), fontsize=12, color='teal', linewidth=2)
# ── Save figure ──
plt.savefig('plot.png', dpi=300, bbox_inches='tight')
plt.savefig('plot.pdf', format='pdf')
# ── Correlation heatmap ──
corr = df.select_dtypes(include='number').corr()
import seaborn as sns
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1)
# ── Pairplot ──
sns.pairplot(df, hue='category', diag_kind='kde')
# ── Time series specific ──
df['price'].plot(kind='line', style='.-', alpha=0.7)
df['price'].rolling(30).mean().plot(label='30-day MA', linewidth=3)
df['price'].rolling(30).std().plot(label='30-day STD', secondary_y=True)
plt.legend()
plt.title('Price with Moving Average')

| kind | Plot | Use For |
|---|---|---|
| line | Line chart | Trends, time series |
| bar / barh | Bar chart | Categories comparison |
| hist | Histogram | Distribution of values |
| box | Box plot | Outliers, quartiles |
| scatter | Scatter | Relationship between vars |
| area | Area chart | Cumulative / composition |
| pie | Pie chart | Proportions |
| kde | Density plot | Smooth distribution |
| hexbin | Hexagonal bin | Large scatter datasets |
# Tip: Use df.plot() for quick exploratory plots, then switch to sns for
# publication-quality figures with better defaults and statistical overlays.
# ── Check memory usage ──
df.info(memory_usage='deep')
df.memory_usage(deep=True)
df.memory_usage(deep=True).sum() / (1024**2) # MB
# ── Downcast numeric types ──
df['int_col'] = df['int_col'].astype('int32') # int64 → int32 (50% savings)
df['int_col'] = pd.to_numeric(df['int_col'], downcast='integer')
df['float_col'] = pd.to_numeric(df['float_col'], downcast='float')
# ── Use category for low-cardinality strings ──
df['city'] = df['city'].astype('category')
# Best when unique values << total rows
# ── Use nullable dtypes (pandas 2.0+) ──
df['col'] = df['col'].astype('Int32') # nullable int (supports NaN)
df['col'] = df['col'].astype('Float64') # nullable float
df['col'] = df['col'].astype('string') # nullable string
# ── Sparse data ──
df['sparse_col'] = pd.arrays.SparseArray(df['col'].values)
# ── Optimal dtypes function ──
def optimize_dtypes(df):
for col in df.select_dtypes(include='int64').columns:
df[col] = pd.to_numeric(df[col], downcast='integer')
for col in df.select_dtypes(include='float64').columns:
df[col] = pd.to_numeric(df[col], downcast='float')
for col in df.select_dtypes(include='object').columns:
if df[col].nunique() / len(df) < 0.5:
df[col] = df[col].astype('category')
return df# ── Vectorize with .apply vs vectorized ──
# SLOW: row-by-row apply
df.apply(lambda row: row['A'] + row['B'], axis=1)
# FAST: vectorized operation
df['A'] + df['B']
# ── Use str accessor for string ops ──
df['col'].str.upper() # vectorized
# vs: df['col'].apply(str.upper) # slower
# ── Use .eval() for complex expressions ──
df.eval('result = (A + B) * C / D')
df.query('A > B and C < D')
# ── itertuples (faster than iterrows) ──
for row in df.itertuples(index=False):
print(row.name, row.age)
# ── Use .loc for assignment ──
df.loc[mask, 'col'] = new_value # correct way
# ── Chunked reading for large files ──
chunks = pd.read_csv('huge.csv', chunksize=100_000)
result = pd.concat([chunk.groupby('dept').sum() for chunk in chunks])
# ── Parquet for fast I/O ──
df.to_parquet('data.parquet', engine='pyarrow', compression='snappy')
df = pd.read_parquet('data.parquet', columns=['col1', 'col2'])
# ── Copy semantics ──
df2 = df.copy() # true copy
df2 = df.copy(deep=False) # shallow copy (shared data blocks)
# ── Enable copy-on-write (pandas 2.0+) ──
pd.set_option('mode.copy_on_write', True)

| Type | Bytes | Range |
|---|---|---|
| int8 | 1 | -128 to 127 |
| int16 | 2 | -32K to 32K |
| int32 | 4 | -2.1B to 2.1B |
| int64 | 8 | -9.2E18 to 9.2E18 |
| float16 | 2 | Half precision |
| float32 | 4 | Single precision |
| float64 | 8 | Double precision |
| category | ~0.5-4 | N unique values |
| bool | 1 | True/False |
| Method | Speed | Notes |
|---|---|---|
| Vectorized op | ★★★★★ | df["A"] + df["B"] |
| .str / .dt accessor | ★★★★ | Built-in vectorized |
| .eval() / .query() | ★★★★ | Expression engine |
| .apply() axis=0 | ★★★ | Column-wise function |
| list comprehension | ★★★ | Often faster than apply |
| .itertuples() | ★★ | Loop but fast iteration |
| .iterrows() | ★ | Slow! Avoid in production |
# Tip: If you reach for iterrows() or apply(axis=1), there's almost always a
# vectorized alternative. Always reach for column-wise operations, .str/.dt
# accessors, or numpy ufuncs first.