Skip to content
Course Notes: Exploratory Data Analysis in Python
Course Notes
Getting to Know a Dataset
# Import any packages you want to use here
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
Basic exploration
# load the books dataset from a CSV file into a DataFrame
books = pd.read_csv("datasets/clean_books.csv")
# first five rows of the DataFrame
books.head()
# summary of the DataFrame, esp. each column's data type and non-null count
# (missing values show up as a non-null count below the number of rows)
books.info()
# summary statistics for the numerical columns
books.describe()
# number of occurrences of each distinct value in a specific column
books["genre"].value_counts()
# basic histogram in Seaborn; binwidth sets the width of each bin on the x-axis
sns.histplot(data=books, x="rating", binwidth=0.1)
# render the figure
plt.show()
Data validation
# show the data type of each column
books.dtypes
# change data type: convert the "year" column to integers
# NOTE(review): astype(int) raises if the column holds missing values — confirm "year" has none
books["year"] = books["year"].astype(int)
# validate categorical values: True where genre is one of the listed categories
books["genre"].isin(["Fiction", "Non Fiction"])
# returns a Boolean Series of the same length as the column
# use tilde to negate: True where genre is NOT one of the listed categories
~books["genre"].isin(["Fiction", "Non Fiction"])
# put the Boolean Series in brackets to keep only the rows where it is True
books[books["genre"].isin(["Fiction", "Non Fiction"])]