Skip to content

EDA Practice in Pandas - Chris Gochis

This notebook contains some practice EDA on an automotive dataset. The goal was to extract insights using descriptive statistics and charts.

# Import Packages and Modules
%matplotlib inline 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#From Scikit Learn
from sklearn import preprocessing
from sklearn.model_selection  import train_test_split, cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report

Read the AUTO.csv file into a data frame and call it Auto.

# Load AUTO.csv into a pandas DataFrame named Auto
Auto = pd.read_csv(
    'AUTO.csv',
    sep=",",
    encoding='ISO-8859-1',
)
# Preview the first rows of the dataset
Auto.head()
# Preview the last rows of the dataset
Auto.tail()

# It appears that the CSV was read in with rows 106-189 full of NaN.
# Drop every row in which all columns are missing instead of hard-coding the
# row positions, so the cleanup keeps working if the number of empty trailing
# rows in the file changes. how='all' leaves rows that are only partially
# missing in place.
Auto = Auto.dropna(how='all')
# Check the tail again
Auto.tail()

# NaN rows are gone!
# Inspect the dtype of every column
Auto.dtypes
# 'Weight (lbs)' shows up as object because its values contain commas.
# Strip the commas and cast the column to float in a single chained step.
Auto['Weight (lbs)'] = Auto['Weight (lbs)'].str.replace(',', '').astype(float)
Auto['Weight (lbs)']
# Re-check the dtypes to confirm the conversion worked
Auto.dtypes

# Success!

Replace any missing values with the median value of the variable (feature).

# Check which columns contain at least one NaN

Auto.isna().any()

# Luggage has NaN values.
# Show the rows that contain a NaN. A boolean mask is used instead of a
# hard-coded positional slice so the affected rows are actually located
# rather than assumed to sit at known positions.
Auto[Auto.isna().any(axis=1)]

# Rows 87 and 88 are NaN