Skip to content
Course Notes: Exploratory Data Analysis in Python
Course Notes
Getting to Know a Dataset
# Import any packages you want to use here
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
Basic exploration
# load the books dataset from a CSV file into a dataframe
books = pd.read_csv("datasets/clean_books.csv")

# first five lines in dataframe
books.head()

# info about dataframe, esp. data types and number of missing values
books.info()

# stats for numerical attributes
books.describe()

# count of attribute values for a specific attribute
books["genre"].value_counts()

# basic histogram in Seaborn (one bin per 0.1 of "rating")
sns.histplot(data=books, x="rating", binwidth=0.1)
plt.show()
Data validation
# show data types
books.dtypes

# change data type
# NOTE(review): presumably "year" has no missing values here, since the
# conversion to int would otherwise fail — confirm against the dataset
books["year"] = books["year"].astype(int)

# validate categorical values
# returns Boolean values of same length
books["genre"].isin(["Fiction", "Non Fiction"])

# use tilde to negate
~books["genre"].isin(["Fiction", "Non Fiction"])

# put it in brackets for filtering
books[books["genre"].isin(["Fiction", "Non Fiction"])]