Importing Pandas
import pandas as pd
Reading Data
# Read CSV file
df = pd.read_csv('filename.csv')
# Read Excel file
df = pd.read_excel('filename.xlsx')
Exploring the DataFrame
# Display first few rows
df.head()
# Display basic information about the DataFrame
df.info()
# Get summary statistics
df.describe()
# Check for missing values
df.isnull().sum()
Handling Missing Data
# Drop rows with any missing values
df_cleaned = df.dropna()
# Fill missing values with a specific value
df['column_name'].fillna(0, inplace=True)
# Fill missing values with the mean of the column
df['column_name'].fillna(df['column_name'].mean(), inplace=True)
Removing Duplicates
# Remove duplicate rows
df_unique = df.drop_duplicates()
# Remove duplicates based on specific columns
df_unique = df.drop_duplicates(subset=['column1', 'column2'])
Renaming Columns
# Rename a single column
df = df.rename(columns={'old_name': 'new_name'})
# Rename multiple columns
df = df.rename(columns={'old_name1': 'new_name1', 'old_name2': 'new_name2'})
Changing Data Types
# Convert a column to float
df['column_name'] = df['column_name'].astype(float)
# Convert column to numeric type
df['column_name'] = pd.to_numeric(df['column_name'], errors='coerce')
# Convert column to datetime
df['date_column'] = pd.to_datetime(df['date_column'])
# Convert a column to string
df['column_name'] = df['column_name'].astype('string')
Filtering Data
# Filter rows based on a condition
df_filtered = df[df['column_name'] > 5]
# Filter rows based on multiple conditions
df_filtered = df[(df['column1'] > 5) & (df['column2'] < 10)]
Handling Outliers
# Remove outliers using Z-score
import numpy as np
from scipy import stats
# Only keep data with a Z-score < 3
df_no_outliers = df[(np.abs(stats.zscore(df['column_name'])) < 3)]
# Cap outliers at a specific percentile
lower = df['column_name'].quantile(0.05)
upper = df['column_name'].quantile(0.95)
df['column_name'] = df['column_name'].clip(lower, upper)
Resources for More Information
Remember, these are just some of the most common operations for cleaning DataFrames. As you become more comfortable with pandas, you’ll discover many more powerful functions and methods to help you clean and manipulate your data effectively.