# Course Notes: Exploratory Data Analysis in Python
# Walk-through of basic exploratory data analysis with pandas and seaborn:
# initial inspection, data validation, and data summarization.
#
# Tabular results are wrapped in print() so they are visible when this runs
# as a plain script (a bare expression is only displayed inside a notebook).
# Every plot is drawn on its own figure and shown explicitly so consecutive
# plots do not pile up on the same axes.

import matplotlib.pyplot as plt
import numpy as np  # kept from the original notes; the pivot table below no longer needs it
import pandas as pd
import seaborn as sns

# ---------------------------------------------------------------------------
# Initial exploration
# ---------------------------------------------------------------------------
unemployment = pd.read_csv("datasets/clean_unemployment.csv")

# Print the first 5 rows of the unemployment dataset.
print(unemployment.head())

# Check the data type of each column and whether the dataset contains null
# values. DataFrame.info() prints its report itself, so no print() here.
unemployment.info()

# Summary statistics for the dataset.
print(unemployment.describe())

# Plot a histogram of the "2021" column with seaborn, using bins of width 1.
plt.figure()
sns.histplot(data=unemployment, x="2021", binwidth=1)
plt.show()

# ---------------------------------------------------------------------------
# Data validation
# ---------------------------------------------------------------------------
# Change the data type of a column.
# NOTE(review): if "2010" holds fractional values, casting to int truncates
# them -- confirm this conversion is intended for this column.
unemployment["2010"] = unemployment["2010"].astype("int")
print(unemployment.dtypes)

# Validating categorical data: keep only rows whose genre is one of the
# listed categories.
books = pd.read_csv("datasets/clean_books.csv")
print(books[books["genre"].isin(["Fiction", "Non Fiction"])].head())

# Validating numerical data: look at the numeric columns only.
print(books.select_dtypes("number").head())

# Box plot of publication year across all books.
plt.figure()
sns.boxplot(data=books, x="year")
plt.show()

# Box plot of publication year, one box per genre.
plt.figure()
sns.boxplot(data=books, x="year", y="genre")
plt.show()

# Box plot of the "2021" column, one box per continent.
plt.figure()
sns.boxplot(data=unemployment, x="2021", y="continent")
plt.show()

# ---------------------------------------------------------------------------
# Data summarization
# ---------------------------------------------------------------------------
# Specify a different set of aggregations for each column.
print(books.agg({"rating": ["mean", "std"], "year": ["median"]}))

# Lowest rating within each genre.
print(books.groupby("genre")["rating"].min())

# Named summary columns: each keyword is the output column name, mapped to a
# (source column, aggregation) pair.
genre_summary = books.groupby("genre").agg(
    mean_rating=("rating", "mean"),
    std_rating=("rating", "std"),
    median_year=("year", "median"),
    max_rating=("rating", "max"),
    min_rating=("rating", "min"),
)
print(genre_summary)

# Minimum rating per genre (rows) and year (columns). The aggregation is
# given by name ("min") rather than as np.min: recent pandas versions warn
# when a NumPy reduction callable is passed as aggfunc.
min_rating_by_genre_year = books.pivot_table(
    values="rating",
    index="genre",
    columns="year",
    aggfunc="min",
)
print(min_rating_by_genre_year)