# NOTE: "Skip to content" - stray web-page navigation text, kept as a comment so the script parses.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the cleaned unemployment dataset (path is relative to the working directory).
unemployment = pd.read_csv("datasets/clean_unemployment.csv")

# Preview the first 5 rows. print() is required because a bare expression
# produces no output when this file is run as a script.
print(unemployment.head())

# Check the data type of each column and whether the dataset contains null
# values. info() prints its own report and returns None, so it is not wrapped.
unemployment.info()

# Summary statistics for the dataset.
print(unemployment.describe())

# Plot a histogram of the "2021" column with seaborn, using bins of width 1.
sns.histplot(data=unemployment, x="2021", binwidth=1)
# Display the histogram now; without this call the later seaborn plots would
# keep drawing onto the same current axes and nothing would be shown.
plt.show()
# Data validation
# Change the data type of the "2010" column to integer.
# NOTE(review): if "2010" holds fractional or missing values, astype("int")
# drops the fractional part and raises on NaN - confirm this is intended.
unemployment["2010"] = unemployment["2010"].astype("int")
# Show the resulting column dtypes; print() is required because a bare
# expression produces no output when this file is run as a script.
print(unemployment.dtypes)
# Validating categorical data
books = pd.read_csv("datasets/clean_books.csv")
# Keep only the rows whose genre is one of the listed categories and preview
# the first 5 of them.
is_listed_genre = books["genre"].isin(["Fiction", "Non Fiction"])
print(books[is_listed_genre].head())

# Validating numerical data
# Preview the first 5 rows of the numeric columns only.
print(books.select_dtypes("number").head())

# Each box plot is followed by plt.show() so it is displayed on its own
# figure; otherwise consecutive seaborn calls draw onto the same current axes.
sns.boxplot(data=books, x="year")
plt.show()

# Distribution of "year" split by "genre".
sns.boxplot(data=books, x="year", y="genre")
plt.show()

# Distribution of the "2021" unemployment column split by "continent".
sns.boxplot(data=unemployment, x="2021", y="continent")
plt.show()
# Data summarization
# Specify aggregations per column: mean and standard deviation of "rating",
# median of "year". print() is required because a bare expression produces
# no output when this file is run as a script.
print(books.agg({"rating": ["mean", "std"], "year": ["median"]}))

# Lowest rating within each genre.
print(books.groupby("genre")["rating"].min())

# Named summary columns: each keyword becomes an output column computed from
# a (source column, aggregation) pair, grouped by genre.
print(
    books.groupby("genre").agg(
        mean_rating=("rating", "mean"),
        std_rating=("rating", "std"),
        median_year=("year", "median"),
        max_rating=("rating", "max"),
        min_rating=("rating", "min"),
    )
)
import numpy as np
# Minimum rating for every genre (rows) and year (columns).
# The aggregation is given as the string "min" rather than the callable
# np.min: pandas >= 2.1 emits a FutureWarning for the NumPy callable and
# recommends the string, which yields the same result.
print(
    books.pivot_table(
        values="rating",
        index="genre",
        columns="year",
        aggfunc="min",
    )
)