# Course Notes: Exploratory Data Analysis in Python

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the cleaned unemployment dataset into a DataFrame.
unemployment = pd.read_csv(filepath_or_buffer="datasets/clean_unemployment.csv")
# Preview the first five rows of the dataset.
unemployment.head(n=5)

# Report each column's data type and non-null count, which reveals
# whether any column contains missing values.
unemployment.info()

# Compute summary statistics for the dataset.
unemployment.describe()

# Plot a histogram of the "2021" column with seaborn, using bins that are
# one unit wide.
sns.histplot(data=unemployment, x="2021", binwidth=1)
# Display the figure explicitly: outside a notebook nothing is rendered
# without this call, and later plots would otherwise reuse the current axes.
plt.show()

# Data validation
# Change the data type of a column: cast "2010" to integers.
# NOTE(review): if "2010" holds fractional values this cast drops the
# fractional part, and it fails on missing values -- confirm this is intended.
column_2010_as_int = unemployment["2010"].astype("int")
unemployment["2010"] = column_2010_as_int
# Confirm the new data type by listing the dtype of every column.
unemployment.dtypes

# Validating categorical data
books = pd.read_csv(filepath_or_buffer="datasets/clean_books.csv")
# Keep only the rows whose genre is one of the listed categories, then
# preview the first five of them.
has_listed_genre = books["genre"].isin(["Fiction", "Non Fiction"])
books.loc[has_listed_genre].head(n=5)

# Validating numerical data
# Select only the numeric columns and preview the first five rows.
books.select_dtypes(include="number").head(n=5)

# Box plot of the "year" column across all books.
sns.boxplot(data=books, x="year")
# Show each plot before drawing the next one; without this the plots are
# never displayed in a plain script and successive calls draw onto the
# same current axes.
plt.show()

# Box plot of "year" broken down by "genre".
sns.boxplot(data=books, x="year", y="genre")
plt.show()

# Box plot of the "2021" unemployment column broken down by "continent".
sns.boxplot(data=unemployment, x="2021", y="continent")
plt.show()

# Data Summarization
# Specify which aggregations to apply to which columns.
column_aggregations = {
    "rating": ["mean", "std"],
    "year": ["median"],
}
books.agg(column_aggregations)

# Lowest rating within each genre.
ratings_by_genre = books.groupby("genre")["rating"]
ratings_by_genre.min()

# Named summary columns: each keyword becomes an output column computed
# from the given source column and aggregation.
books.groupby("genre").agg(
    mean_rating=pd.NamedAgg(column="rating", aggfunc="mean"),
    std_rating=pd.NamedAgg(column="rating", aggfunc="std"),
    median_year=pd.NamedAgg(column="year", aggfunc="median"),
    max_rating=pd.NamedAgg(column="rating", aggfunc="max"),
    min_rating=pd.NamedAgg(column="rating", aggfunc="min"),
)

import numpy as np
# Pivot table of the minimum rating, with one row per genre and one column
# per year.
# The string alias "min" replaces np.min: recent pandas versions emit a
# FutureWarning when a NumPy reduction callable is passed as aggfunc and
# recommend the string form, which produces the same result.
books.pivot_table(values="rating", index="genre", columns="year", aggfunc="min")