Skip to content

Data Validation & Cleaning

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw launch records from the CSV file.
sp = pd.read_csv('space.csv')
# Rename the listed columns to their all-lowercase spelling
# (e.g. 'RocketStatus' -> 'rocketstatus'); other columns are left as they are.
lowercase_names = {
    header: header.lower()
    for header in (
        'Company',
        'Location',
        'Mission',
        'Date',
        'Time',
        'Rocket',
        'RocketStatus',
        'Price',
        'MissionStatus',
    )
}
sp = sp.rename(columns=lowercase_names)
Hidden code
# Changing datatypes: cast the text columns to str.
# A plain astype(str) turns a missing value into the literal string 'nan',
# which isnull()/isna() no longer reports as missing. The positions that were
# missing before the cast are therefore recorded first and masked back to
# missing afterwards, so later null checks still see them.
for text_col in ['company', 'location', 'mission', 'rocket',
                 'rocketstatus', 'facility', 'missionstatus']:
    was_missing = sp[text_col].isna()
    sp[text_col] = sp[text_col].astype(str).mask(was_missing)
Hidden output
# There are missing values in the time column.

# First step: parse the 12-hour 'HH:MM:SS AM/PM' strings and keep only the
# time-of-day part, which gets rid of the AM/PM marker.
sp['time'] = pd.to_datetime(sp['time'], format='%I:%M:%S %p').dt.time

# Second step: convert the time-of-day values to timedeltas so that a mean
# can be computed on the column.
sp['time'] = pd.to_timedelta(sp['time'].astype(str))

# Third step: mean time per (facility, country) group. The dictionary is
# keyed by (facility, country) tuples.
mean_time = sp.groupby(['facility', 'country'])['time'].mean()
mean_time_dict = mean_time.to_dict()

# Fourth step: fill each missing time with the mean of its own
# (facility, country) group. The lookup has to go through the row's group,
# not through the time value itself: mapping the 'time' values against
# mean_time_dict never matches a (facility, country) key, so it fills nothing.
# transform('mean') returns the group mean aligned to every row instead.
group_mean_time = sp.groupby(['facility', 'country'])['time'].transform('mean')
sp['time'] = sp['time'].fillna(group_mean_time)

# Groups without any known time still leave missing values, so the rest is
# filled with the mean of the whole dataset.
m_time = sp['time'].mean()
sp['time'] = sp['time'].fillna(m_time)
# Count how many null values each of the listed columns contains
null = [
    'location', 'company', 'facility', 'mission', 'date',
    'time', 'rocket', 'rocketstatus', 'price', 'missionstatus', 'country',
]
null_count = sp[null].isna().sum()
print(null_count)
# There are missing values in the price column.

# First step: mean price of each (facility, country) group.
mean_price = sp.groupby(['facility', 'country'])['price'].mean()

# Putting the means in a dictionary keyed by (facility, country) tuples.
price_dict = mean_price.to_dict()

# Filling blanks with the mean price of the row's own group.
# The fill must start from the 'price' column itself; starting from 'time'
# would overwrite every price with a time value. The group mean is obtained
# with transform('mean'), which is aligned row by row, because mapping the
# price values against price_dict never matches a (facility, country) key.
# Rows whose group has no known price at all stay missing.
group_mean_price = sp.groupby(['facility', 'country'])['price'].transform('mean')
sp['price'] = sp['price'].fillna(group_mean_price)

Objective one: How have rocket launches trended across time? Has mission success rate increased?

# Work on a copy of the cleaned frame for this objective
import copy
sp_c = copy.copy(sp)
# Parse the date column and store the year of each launch in a new column
launch_dates = pd.to_datetime(sp_c['date'])
sp_c['year'] = launch_dates.dt.year

# Rows whose mission status is exactly 'Success'
is_success = sp_c['missionstatus'] == 'Success'
mission_success = sp_c[is_success]

# Number of successful missions in each year; the count is stored in a
# column named 'success_rate'
mission_success_per_year = (
    mission_success.groupby('year')
    .size()
    .reset_index(name='success_rate')
)

print(mission_success_per_year)
Hidden output
# Rows whose mission status is exactly 'Failure'
is_failure = sp_c['missionstatus'] == 'Failure'
mission_failure = sp_c[is_failure]

# Number of failed missions in each year; the count is stored in a
# column named 'failure_rate'
mission_failure_per_year = (
    mission_failure.groupby('year')
    .size()
    .reset_index(name='failure_rate')
)
print(mission_failure_per_year)
Hidden output
# Wrap the per-year success and failure tables in DataFrames
success = pd.DataFrame(data=mission_success_per_year)
failure = pd.DataFrame(data=mission_failure_per_year)