Skip to content

Cleaning Data in Python

For each DataFrame, inspect the data types of each column and, where needed, clean and convert columns into the correct data type. You should also rename any columns to have more descriptive titles.

%%capture
# Install fuzzywuzzy
!pip install fuzzywuzzy
# Importing course packages; you can add more too!
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import missingno as msno
import fuzzywuzzy
import recordlinkage 

# Load the course datasets as DataFrames. In every file the column named
# 'Unnamed: 0' is used as the DataFrame index instead of a data column.
_INDEX_COL = 'Unnamed: 0'

ride_sharing = pd.read_csv('datasets/ride_sharing_new.csv', index_col=_INDEX_COL)
airlines = pd.read_csv('datasets/airlines_final.csv', index_col=_INDEX_COL)
banking = pd.read_csv('datasets/banking_dirty.csv', index_col=_INDEX_COL)
restaurants = pd.read_csv('datasets/restaurants_L2.csv', index_col=_INDEX_COL)
restaurants_new = pd.read_csv('datasets/restaurants_L2_dirty.csv', index_col=_INDEX_COL)

# Preview the first five rows of ride_sharing.
ride_sharing.head()

Identify and remove all the duplicate rows in ride_sharing.

# Inspect ride_sharing, then convert its columns to appropriate dtypes.

# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
ride_sharing.info()
# Print the number of missing values in each column.
print(ride_sharing.isna().sum())

# user_type and user_gender are converted to the category dtype. For each
# column: show its unique values, convert, confirm the new dtype, and print
# the column's updated summary.
for column in ("user_type", "user_gender"):
    print(ride_sharing[column].unique())
    ride_sharing[column] = ride_sharing[column].astype("category")
    # Confirm the conversion took effect.
    assert ride_sharing[column].dtype == "category"
    ride_sharing[column].info()

# Remove the unit text from duration. NOTE: str.strip("minutes") treats its
# argument as a *set of characters* ({m, i, n, u, t, e, s}) trimmed from both
# ends of each string, not as a literal suffix; digits are not in that set,
# so the numeric part is left untouched.
duration_text = ride_sharing["duration"].str.strip("minutes")
# Convert the remaining text to integers.
ride_sharing["duration"] = duration_text.astype("int")
# Confirm the conversion took effect.
assert ride_sharing["duration"].dtype == "int"
# Print the average ride duration.
print(ride_sharing["duration"].mean())