# Pandas Cheatsheet
# DataFrames, indexing, groupby, joins, cleaning and time-series operations for data analysis.
import pandas as pd
import numpy as np
# ── Series ──
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'], name='values')
s = pd.Series({'x': 1, 'y': 2, 'z': 3})
s = pd.Series(np.arange(5), dtype='float32')
# ── DataFrame from dict ──
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie'],
'age': [25, 30, 35],
'city': ['NYC', 'LA', 'Chicago'],
})
# ── DataFrame from list of dicts ──
df = pd.DataFrame([
{'name': 'Alice', 'age': 25},
{'name': 'Bob', 'age': 30},
])
# ── DataFrame from NumPy array ──
arr = np.random.randn(4, 3)
df = pd.DataFrame(arr, columns=['A', 'B', 'C'], index=['w', 'x', 'y', 'z'])
# ── From list of tuples ──
data = [('Alice', 25, 'NYC'), ('Bob', 30, 'LA')]
df = pd.DataFrame(data, columns=['name', 'age', 'city'])
# ── Read from files ──
df = pd.read_csv('data.csv')
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
df = pd.read_json('data.json')
df = pd.read_sql('SELECT * FROM table', con)
df = pd.read_parquet('data.parquet')
df = pd.read_clipboard() # from clipboard
df = pd.read_html(url)[0] # from HTML tables
# ── Ranges & sequences ──
dates = pd.date_range('2024-01-01', periods=6, freq='D')
df = pd.DataFrame({'date': dates, 'value': np.arange(6)})
# ── MultiIndex DataFrame ──
index = pd.MultiIndex.from_tuples(
[('US', 'CA'), ('US', 'NY'), ('UK', 'LN'), ('UK', 'MN')],
names=['country', 'city']
)
df = pd.DataFrame({'population': [39, 19, 9, 3]}, index=index)
# ── Categorical ──
df = pd.DataFrame({
'grade': pd.Categorical(['A', 'B', 'A', 'C', 'B'],
categories=['A', 'B', 'C', 'D'],
ordered=True),
})
# ── Timedelta ──
df = pd.DataFrame({
'task': ['build', 'test', 'deploy'],
'duration': pd.to_timedelta(['2h 30m', '1h 15m', '45m']),
})
# ── Quick exploratory DataFrames ──
df_test = pd.DataFrame(np.random.rand(100, 5)) # 100×5 random floats
df_zero = pd.DataFrame(np.zeros((3, 4)), columns=list('ABCD'))
# ── Convert between structures ──
s.to_frame() # Series → DataFrame
df.to_dict('records') # → list of dicts
df.to_dict('list') # → dict of lists
df.to_numpy() # → np.ndarray
df.values # → np.ndarray (same)
df.to_dict('index') # → {index: {col: val}}
df.to_dict('split') # → {'columns':.., 'data':.., 'index':..}

| Function | Format | Key Param |
|---|---|---|
| read_csv() | CSV, TSV | sep, header, names, dtype |
| read_excel() | XLSX, XLS | sheet_name, engine |
| read_json() | JSON | orient, lines |
| read_parquet() | Parquet | columns, filters |
| read_sql() | SQL query | con, params, chunksize |
| read_html() | HTML table | match, flavor |
| read_clipboard() | Clipboard | sep |
| read_fwf() | Fixed-width | colspecs, widths |
# Tip: Specify dtype explicitly in read_csv() to avoid memory-heavy type
# inference. Use pd.read_csv(..., dtype=dict(id='int32', price='float32')).
# ── Detect missing values ──
df.isna() # boolean mask of NaN/None
df.isnull() # alias for isna()
df.notna() # inverse
df.isna().sum() # count NaN per column
df.isna().sum().sum() # total NaN count
df[df.isna().any(axis=1)] # rows with any NaN
# ── Drop missing ──
df.dropna() # drop rows with ANY NaN
df.dropna(how='all') # drop rows where ALL values NaN
df.dropna(subset=['col1', 'col2']) # only check these columns
df.dropna(thresh=3) # keep rows with ≥3 non-NaN values
df.dropna(axis=1) # drop columns with NaN
# ── Fill missing ──
df.fillna(0) # fill all NaN with 0
df.fillna({'col1': 0, 'col2': 'unknown'})
df['col'].fillna(df['col'].mean()) # fill with mean
df['col'].fillna(df['col'].median()) # fill with median
df['col'].fillna(df['col'].mode()[0]) # fill with mode
df.fillna(method='ffill') # forward fill (deprecated → ffill())
df.ffill() # forward fill
df.bfill() # backward fill
df.interpolate(method='linear') # linear interpolation
# ── Deduplication ──
df.duplicated() # boolean mask of duplicates
df.duplicated(subset=['col1']) # check only specific columns
df.drop_duplicates() # remove duplicate rows
df.drop_duplicates(subset=['col1'], keep='last')
df.drop_duplicates(keep=False) # drop ALL duplicates
# ── Rename columns ──
df.rename(columns={'old': 'new'})
df.rename(columns=str.lower) # lowercase all columns
df.columns = ['a', 'b', 'c'] # assign new column names
df.columns = df.columns.str.strip() # strip whitespace
# ── Replace values ──
df.replace('N/A', np.nan)
df.replace({'col1': {0: np.nan, -1: np.nan}})
df.replace([r'^\s+$'], np.nan, regex=True) # blank strings → NaN
# ── String cleaning ──
df['col'] = df['col'].str.strip() # trim whitespace
df['col'] = df['col'].str.lower() # lowercase
df['col'] = df['col'].str.upper() # uppercase
df['col'] = df['col'].str.title() # title case
df['col'] = df['col'].str.replace(',', '') # remove commas
df['col'] = df['col'].str.replace(r'\D', '', regex=True) # keep digits only
df['col'] = df['col'].str.split(',').str[0] # split and take first
df['col'] = df['col'].str.extract(r'(\d+)') # regex extract
# ── Type conversion ──
df['col'] = df['col'].astype(int)
df['col'] = df['col'].astype('float32')
df['col'] = pd.to_numeric(df['col'], errors='coerce') # invalid → NaN
df['col'] = pd.to_datetime(df['col'])
df['col'] = df['col'].astype('category')
# ── Outlier detection with IQR ──
Q1 = df['col'].quantile(0.25)
Q3 = df['col'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
outliers = df[(df['col'] < lower) | (df['col'] > upper)]
# ── Clip outliers ──
df['col'] = df['col'].clip(lower, upper)
# ── Z-score method ──
from scipy import stats
z_scores = np.abs(stats.zscore(df['col']))
outliers = df[z_scores > 3]
# ── Apply custom cleaning function ──
def clean_value(val):
if pd.isna(val):
return 0
if isinstance(val, str):
return float(val.replace('$', '').replace(',', ''))
return val
df['col'] = df['col'].apply(clean_value)

| Strategy | Method | Best For |
|---|---|---|
| Drop rows | dropna() | Few missing rows (<5%) |
| Fill constant | fillna(val) | Known default values |
| Fill mean/med | fillna(df.mean()) | Numeric, normal dist |
| Forward fill | ffill() | Time series data |
| Interpolate | interpolate() | Sequential numeric data |
| Drop columns | dropna(axis=1) | Columns >50% missing |
| Coerce | to_numeric(coerce) | Mixed-type columns |
| Custom fill | fillna(func) | Domain-specific logic |
# Caution: fillna() before groupby() operations can introduce bias. Always
# understand why data is missing (MCAR, MAR, MNAR) before choosing an
# imputation strategy.
# ── Column selection ──
df['col'] # single column → Series
df[['col1', 'col2']] # multiple columns → DataFrame
df.loc[:, 'A':'C'] # columns A through C (inclusive)
df.iloc[:, [0, 2, 4]] # columns by integer position
df.filter(regex='^price_') # regex match on column names
df.filter(like='total') # substring match on column names
df.select_dtypes(include='number') # only numeric columns
df.select_dtypes(include=['int64', 'float64'])
df.select_dtypes(exclude='object')
# ── Row selection by label ──
df.loc[0] # single row by label
df.loc[2:5] # rows 2–5 inclusive
df.loc[df.index[0]] # first row
df.loc[['row1', 'row3', 'row5']]
# ── Row selection by position ──
df.iloc[0] # first row
df.iloc[-1] # last row
df.iloc[2:5] # rows 2, 3, 4 (exclusive end)
df.iloc[[0, 3, 7]] # specific rows
df.iloc[::2] # every other row
# ── Combined selection ──
df.loc[2:5, ['name', 'age']] # rows 2-5, specific columns
df.iloc[0:3, 0:2] # first 3 rows, first 2 cols
df.at[0, 'col'] # scalar value (fast)
df.iat[0, 2] # scalar by position (fast)
# ── Boolean indexing ──
df[df['age'] > 25] # single condition
df[(df['age'] > 25) & (df['city'] == 'NYC')] # AND
df[(df['age'] > 25) | (df['city'] == 'NYC')] # OR
df[~(df['age'] > 25)] # NOT
df[df['city'].isin(['NYC', 'LA', 'Chicago'])] # membership
df[df['name'].str.contains('Al')] # string contains
df[df['name'].str.startswith('A')]
df[df['col'].between(10, 50)] # range
df[df['col'].isna()] # is NaN
df[df['col'].notna()] # not NaN
# ── query() — SQL-like syntax ──
df.query('age > 25 and city == "NYC"')
df.query('age > @threshold') # reference variable
df.query('A > B') # compare columns
df.query('not (A > 5 or B < 2)')
# ── nlargest / nsmallest ──
df.nlargest(5, 'salary') # top 5 by column
df.nsmallest(3, 'price', keep='last')
# ── Sample ──
df.sample(n=5) # random 5 rows
df.sample(frac=0.1) # random 10%
df.sample(n=3, random_state=42) # reproducible
# ── Where / mask ──
df.where(df['age'] > 25) # keep non-matching as NaN
df.mask(df['age'] < 18) # NaN where condition is True
df.clip(lower=0, upper=100) # clip values to range

| Aspect | loc[] | iloc[] |
|---|---|---|
| Indexing by | Label / name | Integer position |
| End slice | Inclusive | Exclusive |
| Accepts | Labels, boolean arrays | Integers, boolean arrays |
| Speed | Slightly slower | Faster (positional) |
| Use case | Named index, conditions | Positional access |
# Tip: Use .query() for complex boolean filters — it's more readable and often
# faster than chained boolean operations, especially with large DataFrames.
# ── Basic aggregation ──
df.sum()
df.mean()
df.median()
df.min()
df.max()
df.count() # non-NaN count
df.std() # standard deviation
df.var() # variance
df.sem() # standard error of the mean
df.quantile([0.25, 0.5, 0.75]) # quartiles
df.describe() # full summary
df.agg(['sum', 'mean', 'std', 'count'])
# ── Column-specific aggregation ──
df.agg({
'salary': ['mean', 'median', 'min', 'max'],
'age': ['mean', 'std'],
'name': 'count',
})
# ── Cumulative operations ──
df['col'].cumsum()
df['col'].cummax()
df['col'].cummin()
df['col'].cumprod()
df['col'].pct_change() # percent change
df['col'].rank() # rank values
# ── GroupBy basics ──
grouped = df.groupby('department')
grouped = df.groupby(['dept', 'team']) # multi-level
grouped = df.groupby(df.index // 5) # bin index
# ── Aggregation ──
grouped.size() # group sizes
grouped.count() # non-NaN per column
grouped.sum()
grouped.mean()
grouped.median()
grouped.std()
grouped.agg(['sum', 'mean', 'count'])
grouped.agg({'salary': ['mean', 'max'], 'age': 'median'})
# ── Named aggregation (clean column names) ──
df.groupby('dept').agg(
avg_salary=('salary', 'mean'),
max_salary=('salary', 'max'),
headcount=('name', 'count'),
)
# ── Transform (same shape as original) ──
df['salary_zscore'] = df.groupby('dept')['salary'].transform(
lambda x: (x - x.mean()) / x.std()
)
df['rank_in_dept'] = df.groupby('dept')['salary'].rank('dense')
# ── Filter groups ──
df.groupby('dept').filter(lambda x: len(x) >= 5) # keep groups with ≥5 rows
# ── Apply custom functions ──
def custom(group):
return pd.Series({
'range': group['salary'].max() - group['salary'].min(),
'iqr': group['salary'].quantile(0.75) - group['salary'].quantile(0.25),
})
df.groupby('dept').apply(custom)
# ── Pivot table ──
df.pivot_table(
values='salary',
index='department',
columns='experience_level',
aggfunc='mean',
fill_value=0,
margins=True, # add row/col totals
margins_name='Total',
observed=True, # only observed categories
)
# ── Multiple aggfuncs in pivot ──
df.pivot_table(
values='salary',
index='dept',
columns='level',
aggfunc=['mean', 'count', 'std'],
)
# ── Melt (unpivot) ──
pd.melt(
df,
id_vars=['name', 'dept'], # columns to keep
value_vars=['Q1', 'Q2', 'Q3'], # columns to unpivot
var_name='quarter',
value_name='revenue',
)
# ── Crosstab ──
pd.crosstab(df['dept'], df['level'], margins=True)
pd.crosstab(df['dept'], df['level'], values=df['salary'], aggfunc='mean')
# ── Stack / Unstack ──
df.set_index(['dept', 'team'])['salary'].unstack(fill_value=0)
df.stack() # pivot columns → rows
df.unstack() # pivot rows → columns

| Method | Returns | Use Case |
|---|---|---|
| aggregate() | Reduced DF | One value per group |
| transform() | Same shape | Normalize, fill per group |
| apply() | Flexible | Custom group operations |
| filter() | Filtered DF | Drop groups by condition |
| size() | Series | Count per group |
| get_group() | DataFrame | Extract one group |
| Function | Direction | When |
|---|---|---|
| pivot() | long → wide | Unique index+columns |
| pivot_table() | long → wide | Needs aggregation |
| melt() | wide → long | Unpivot columns |
| stack() | cols → rows | Innermost level |
| unstack() | rows → cols | Innermost index |
| crosstab() | 2 cols → table | Frequency / agg table |
# Tip: Prefer named aggregation with .agg(avg_x=('x', 'mean'), ...) instead of
# chained rename operations. It produces clean, readable column names in one step.
# ── Merge (SQL-style join) ──
df_merged = pd.merge(
left=df1,
right=df2,
on='key', # join column (both sides)
how='inner', # inner, outer, left, right
suffixes=('_left', '_right'),
indicator=True, # adds _merge column
validate='one_to_one', # 1:1, 1:m, m:1, m:m
)
# ── Different key names ──
pd.merge(df1, df2, left_on='id', right_on='user_id')
# ── Multiple keys ──
pd.merge(df1, df2, on=['dept', 'team'])
# ── Join types ──
pd.merge(df1, df2, how='inner') # matching rows only
pd.merge(df1, df2, how='outer') # all rows (NaN for missing)
pd.merge(df1, df2, how='left') # all from left
pd.merge(df1, df2, how='right') # all from right
pd.merge(df1, df2, how='cross') # cartesian product
# ── Index merge ──
df1.join(df2, on='key', how='left', lsuffix='_l', rsuffix='_r')
df1.join(df2, how='outer') # both indices
# ── Merge on index ──
pd.merge(df1, df2, left_index=True, right_index=True)
pd.merge(df1, df2, left_on='key', right_index=True)
# ── Concatenate (stack vertically / horizontally) ──
pd.concat([df1, df2], axis=0) # stack rows (vertical)
pd.concat([df1, df2], axis=1) # stack columns (horizontal)
pd.concat([df1, df2], ignore_index=True) # reset index
# ── Concat with keys ──
pd.concat([df1, df2, df3], keys=['Q1', 'Q2', 'Q3'])
# creates MultiIndex: (Q1, 0), (Q1, 1), (Q2, 0), ...
# ── Concat with inner join on columns ──
pd.concat([df1, df2], axis=0, join='inner') # only common columns
pd.concat([df1, df2], axis=0, join='outer') # all columns (default)
# ── Combine first (fill missing with another) ──
df1.combine_first(df2) # prefer df1, fill NaN from df2
# ── Update (in-place fill) ──
df1.update(df2) # overwrite df1 in place with non-NaN values from df2
# ── Append rows (convenience, concat under the hood) ──
df.loc[len(df)] = {'name': 'Alice', 'age': 25} # single row
df = pd.concat([df, new_row_df], ignore_index=True)
# ── Compare DataFrames ──
df1.compare(df2) # show differences

| Type | Left Rows | Right Rows | SQL Equivalent |
|---|---|---|---|
| inner | Matched | Matched | INNER JOIN |
| left | All | Matched | LEFT JOIN |
| right | Matched | All | RIGHT JOIN |
| outer | All | All | FULL OUTER JOIN |
| cross | All × All | All × All | CROSS JOIN |
| Method | Best For | Key Param |
|---|---|---|
| merge() | SQL-style join | on, how |
| join() | Index-based join | on, how |
| concat() | Stack/combine | axis, keys |
| combine_first() | Fill missing | N/A |
| append() | Add rows (deprecated) | N/A |
# Tip: After merge(), check for duplicate columns with suffixes. Use
# validate='one_to_many' to catch unexpected duplicates early.
# ── Create datetime objects ──
pd.to_datetime('2024-01-15')
pd.to_datetime(['2024-01-01', '2024-02-01', '2024-03-01'])
pd.to_datetime(df['date_str'], format='%Y-%m-%d')
pd.to_datetime(df['timestamp'], unit='s') # unix epoch
# ── Date ranges ──
pd.date_range('2024-01-01', periods=10, freq='D')
pd.date_range('2024-01-01', '2024-12-31', freq='MS') # month start
pd.date_range('2024-01-01', periods=12, freq='ME') # month end
pd.bdate_range('2024-01-01', periods=10) # business days
# ── Frequency aliases ──
# D=daily, W=weekly, ME=month end, MS=month start
# YE=year end, YS=year start, h=hourly, min=minutely
# s=secondly, B=business day, BH=business hour
# 2D=every 2 days, 2h=every 2 hours, 15min=every 15 min
# ── Set datetime index ──
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df = df.sort_index()
# ── Access datetime properties ──
df.index.year
df.index.month
df.index.day
df.index.day_name() # 'Monday', 'Tuesday', ...
df.index.dayofweek # 0=Mon, 6=Sun
df.index.quarter
df.index.is_month_start
df.index.is_month_end
df.index.is_quarter_end
# ── Resampling (change frequency) ──
# Downsample (reduce frequency)
df.resample('W').sum() # weekly sum
df.resample('ME').mean() # monthly mean
df.resample('QE').max() # quarterly max
df.resample('YE').std() # yearly std
# Resample with custom agg
df.resample('ME').agg({
'price': ['mean', 'max', 'min'],
'volume': 'sum',
})
# Upsample (increase frequency)
df.resample('D').asfreq() # fill with NaN
df.resample('D').ffill() # forward fill
df.resample('D').interpolate() # interpolate
# ── Rolling windows ──
df['price'].rolling(window=7).mean() # 7-period MA
df['price'].rolling(window=30).std() # 30-period std
df['price'].rolling(window=7, center=True).mean()
df['price'].rolling(window=7, min_periods=3).mean() # need ≥3 points
# ── Exponentially weighted ──
df['price'].ewm(span=7, adjust=False).mean() # EMA
# ── Shifting ──
df['price'].shift(1) # shift down by 1 (lag)
df['price'].shift(-1) # shift up by 1 (lead)
df['pct_change'] = df['price'].pct_change()
df['diff'] = df['price'].diff()
# ── Timezone handling ──
df.index = df.index.tz_localize('UTC')
df.index = df.index.tz_convert('America/New_York')
df.index.tz # show timezone
df.index.tz_convert(None) # remove timezone
# ── Date arithmetic ──
pd.Timestamp('2024-01-01') + pd.Timedelta(days=30)
pd.Timestamp('2024-01-01') - pd.Timestamp('2023-06-15')
df['date'] + pd.DateOffset(months=3)
# ── Between time ──
df.between_time('09:30', '16:00') # keep hours between
df.at_time('09:30') # exact time

| Alias | Description | Example |
|---|---|---|
| D | Calendar day | freq="D" |
| B | Business day | freq="B" |
| W-MON | Week ending Monday | freq="W-MON" |
| MS | Month start | freq="MS" |
| ME | Month end | freq="ME" |
| QE | Quarter end | freq="QE" |
| YE | Year end | freq="YE" |
| h | Hourly | freq="h" |
| 15min | 15 minutes | freq="15min" |
| Function | Window Type | Use Case |
|---|---|---|
| rolling(w) | Fixed window | Moving average, Bollinger |
| expanding() | Expanding window | Cumulative mean |
| ewm(span) | Exponential decay | Smoothed trends |
| shift(n) | Time shift | Lag/lead features |
| diff() | Difference | Stationarity, returns |
| pct_change() | Percent change | Growth rates |
# Tip: Use df.set_index('date').sort_index() to enable .resample(), .rolling(),
# and .asfreq().
import matplotlib.pyplot as plt
# ── Line plot ──
df.plot(kind='line', x='date', y='price', figsize=(12, 5))
df['price'].plot(title='Price Over Time', grid=True, color='blue')
# ── Bar plot ──
df['category'].value_counts().plot(kind='bar')
df.plot(kind='bar', x='name', y=['salary', 'bonus'], stacked=True)
df.plot(kind='barh', x='name', y='salary') # horizontal
# ── Histogram ──
df['age'].plot(kind='hist', bins=20, edgecolor='white', alpha=0.8)
df['age'].plot(kind='hist', bins=20, density=True) # normalized
# ── Scatter ──
df.plot(kind='scatter', x='height', y='weight', s=df['age'], alpha=0.5)
# ── Box plot ──
df.plot(kind='box', column=['salary', 'bonus'], by='department')
# ── Area plot ──
df[['sales_A', 'sales_B', 'sales_C']].plot(kind='area', stacked=True)
# ── Pie chart ──
df['category'].value_counts().plot(kind='pie', autopct='%1.1f%%')
# ── Subplots ──
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
df['col1'].plot(ax=axes[0, 0], kind='hist')
df['col2'].plot(ax=axes[0, 1], kind='box')
df['col3'].plot(ax=axes[1, 0], kind='line')
df.plot(ax=axes[1, 1], kind='scatter', x='col1', y='col2')
# ── Styling ──
plt.style.use('seaborn-v0_8-darkgrid')
df.plot(title='Title', xlabel='X Label', ylabel='Y Label',
figsize=(12, 6), fontsize=12, color='teal', linewidth=2)
# ── Save figure ──
plt.savefig('plot.png', dpi=300, bbox_inches='tight')
plt.savefig('plot.pdf', format='pdf')
# ── Correlation heatmap ──
corr = df.select_dtypes(include='number').corr()
import seaborn as sns
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1)
# ── Pairplot ──
sns.pairplot(df, hue='category', diag_kind='kde')
# ── Time series specific ──
df['price'].plot(kind='line', style='.-', alpha=0.7)
df['price'].rolling(30).mean().plot(label='30-day MA', linewidth=3)
df['price'].rolling(30).std().plot(label='30-day STD', secondary_y=True)
plt.legend()
plt.title('Price with Moving Average')

| kind | Plot | Use For |
|---|---|---|
| line | Line chart | Trends, time series |
| bar / barh | Bar chart | Categories comparison |
| hist | Histogram | Distribution of values |
| box | Box plot | Outliers, quartiles |
| scatter | Scatter | Relationship between vars |
| area | Area chart | Cumulative / composition |
| pie | Pie chart | Proportions |
| kde | Density plot | Smooth distribution |
| hexbin | Hexagonal bin | Large scatter datasets |
# Tip: Use df.plot() for quick exploratory plots, then switch to sns for
# publication-quality figures with better defaults and statistical overlays.
# ── Check memory usage ──
df.info(memory_usage='deep')
df.memory_usage(deep=True)
df.memory_usage(deep=True).sum() / (1024**2) # MB
# ── Downcast numeric types ──
df['int_col'] = df['int_col'].astype('int32') # int64 → int32 (50% savings)
df['int_col'] = pd.to_numeric(df['int_col'], downcast='integer')
df['float_col'] = pd.to_numeric(df['float_col'], downcast='float')
# ── Use category for low-cardinality strings ──
df['city'] = df['city'].astype('category')
# Best when unique values << total rows
# ── Use nullable dtypes (pandas 2.0+) ──
df['col'] = df['col'].astype('Int32') # nullable int (supports NaN)
df['col'] = df['col'].astype('Float64') # nullable float
df['col'] = df['col'].astype('string') # nullable string
# ── Sparse data ──
df['sparse_col'] = pd.arrays.SparseArray(df['col'].values)
# ── Optimal dtypes function ──
def optimize_dtypes(df):
for col in df.select_dtypes(include='int64').columns:
df[col] = pd.to_numeric(df[col], downcast='integer')
for col in df.select_dtypes(include='float64').columns:
df[col] = pd.to_numeric(df[col], downcast='float')
for col in df.select_dtypes(include='object').columns:
if df[col].nunique() / len(df) < 0.5:
df[col] = df[col].astype('category')
return df# ── Vectorize with .apply vs vectorized ──
# SLOW: row-by-row apply
df.apply(lambda row: row['A'] + row['B'], axis=1)
# FAST: vectorized operation
df['A'] + df['B']
# ── Use str accessor for string ops ──
df['col'].str.upper() # vectorized
# vs: df['col'].apply(str.upper) # slower
# ── Use .eval() for complex expressions ──
df.eval('result = (A + B) * C / D')
df.query('A > B and C < D')
# ── itertuples (faster than iterrows) ──
for row in df.itertuples(index=False):
print(row.name, row.age)
# ── Use .loc for assignment ──
df.loc[mask, 'col'] = new_value # correct way
# ── Chunked reading for large files ──
chunks = pd.read_csv('huge.csv', chunksize=100_000)
result = pd.concat([chunk.groupby('dept').sum() for chunk in chunks])
# ── Parquet for fast I/O ──
df.to_parquet('data.parquet', engine='pyarrow', compression='snappy')
df = pd.read_parquet('data.parquet', columns=['col1', 'col2'])
# ── Copy semantics ──
df2 = df.copy() # true copy
df2 = df.copy(deep=False) # shallow copy (shared data blocks)
# ── Enable copy-on-write (pandas 2.0+) ──
pd.set_option('mode.copy_on_write', True)

| Type | Bytes | Range |
|---|---|---|
| int8 | 1 | -128 to 127 |
| int16 | 2 | -32K to 32K |
| int32 | 4 | -2.1B to 2.1B |
| int64 | 8 | -9.2E18 to 9.2E18 |
| float16 | 2 | Half precision |
| float32 | 4 | Single precision |
| float64 | 8 | Double precision |
| category | ~0.5-4 | N unique values |
| bool | 1 | True/False |
| Method | Speed | Notes |
|---|---|---|
| Vectorized op | ★★★★★ | df["A"] + df["B"] |
| .str / .dt accessor | ★★★★ | Built-in vectorized |
| .eval() / .query() | ★★★★ | Expression engine |
| .apply() axis=0 | ★★★ | Column-wise function |
| list comprehension | ★★★ | Often faster than apply |
| .itertuples() | ★★ | Loop but fast iteration |
| .iterrows() | ★ | Slow! Avoid in production |
# Tip: If you reach for iterrows() or apply(axis=1), there's almost always a
# vectorized alternative. Always reach for column-wise operations, .str/.dt
# accessors, or numpy ufuncs first.