Skip to content
Ciclystic_Proyect
Importing Libraries
# Importing Data Science Libraries
import pandas as pd
import numpy as np
# Importing Visualization Libraries
import missingno as msno
import matplotlib.pyplot as plt
from geopy.distance import geodesic
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
Import Dataset from Drive
# Paths of the twelve 2022 ride exports (file names run bike01 .. bike12),
# all located in the same Drive folder.
monthly_csv_paths = [
    "/content/drive/MyDrive/DataSet_Ciclystic/bike01_2022.csv",
    "/content/drive/MyDrive/DataSet_Ciclystic/bike02_2022.csv",
    "/content/drive/MyDrive/DataSet_Ciclystic/bike03_2022.csv",
    "/content/drive/MyDrive/DataSet_Ciclystic/bike04_2022.csv",
    "/content/drive/MyDrive/DataSet_Ciclystic/bike05_2022.csv",
    "/content/drive/MyDrive/DataSet_Ciclystic/bike06_2022.csv",
    "/content/drive/MyDrive/DataSet_Ciclystic/bike07_2022.csv",
    "/content/drive/MyDrive/DataSet_Ciclystic/bike08_2022.csv",
    "/content/drive/MyDrive/DataSet_Ciclystic/bike09_2022.csv",
    "/content/drive/MyDrive/DataSet_Ciclystic/bike10_2022.csv",
    "/content/drive/MyDrive/DataSet_Ciclystic/bike11_2022.csv",
    "/content/drive/MyDrive/DataSet_Ciclystic/bike12_2022.csv",
]

# Read the files in list order and bind each DataFrame to its own
# month-numbered variable, since later cells refer to these names directly.
(
    bike01_2022,
    bike02_2022,
    bike03_2022,
    bike04_2022,
    bike05_2022,
    bike06_2022,
    bike07_2022,
    bike08_2022,
    bike09_2022,
    bike10_2022,
    bike11_2022,
    bike12_2022,
) = (pd.read_csv(csv_path) for csv_path in monthly_csv_paths)
Concatenate all tables
Run cancelled
# Monthly DataFrames to combine, listed in month order (01 .. 12).
dataframes_list = [
    bike01_2022, bike02_2022, bike03_2022, bike04_2022, bike05_2022, bike06_2022,
    bike07_2022, bike08_2022, bike09_2022, bike10_2022, bike11_2022, bike12_2022,
]

# Schema sanity check before stacking: every frame should expose the same set
# of column names as the first one. Comparing sets ignores column order.
# A mismatch only prints a warning; the concatenation below still runs.
columnas_primer_df = set(dataframes_list[0].columns)
for i, df in enumerate(dataframes_list[1:], start=2):  # start=2 so the message is 1-based
    if set(df.columns) != columnas_primer_df:
        print(f"Warning: The columns of the DataFrame {i} do not match those of the first DataFrame.")

# Stack all frames row-wise into one table. ignore_index=True gives the result
# a fresh 0..n-1 index; without it each frame keeps its own row labels, so the
# combined index repeats and label-based lookups match several rows at once.
bike_data_2022 = pd.concat(dataframes_list, ignore_index=True)
Run cancelled
# Column labels of the combined table. As a bare expression this only produces
# visible output when it is the last statement of a notebook cell.
bike_data_2022.columns
Run cancelled
# Preview the first 20 rows of the combined table (notebook cell output).
bike_data_2022.head(20)
Run cancelled
# Print a summary of the combined table: column dtypes, non-null counts and memory usage.
bike_data_2022.info()
Run cancelled
# Parse both ride timestamp columns into pandas datetimes so they support
# date arithmetic.
for timestamp_col in ('ended_at', 'started_at'):
    bike_data_2022[timestamp_col] = pd.to_datetime(bike_data_2022[timestamp_col])

# Ride duration = end time minus start time, stored in a new 'ride_lenght'
# column (the column name is kept exactly as spelled).
ride_start = bike_data_2022['started_at']
ride_end = bike_data_2022['ended_at']
bike_data_2022['ride_lenght'] = ride_end - ride_start

# Show the two timestamps next to the derived duration as a quick check.
duration_preview_cols = ['started_at', 'ended_at', 'ride_lenght']
print(bike_data_2022[duration_preview_cols])
Run cancelled
# Make sure 'started_at' holds datetimes; the conversion is repeated here so
# this cell also works when run on its own.
started_at_parsed = pd.to_datetime(bike_data_2022['started_at'])
bike_data_2022['started_at'] = started_at_parsed

# Derive the weekday name of each ride's start into a new 'day_of_week' column.
bike_data_2022['day_of_week'] = started_at_parsed.dt.day_name()

# Show the start timestamp next to its weekday as a quick check.
weekday_preview_cols = ['started_at', 'day_of_week']
print(bike_data_2022[weekday_preview_cols])
Distribution of null values in the Data Frame
Run cancelled
# Draw missingno's matrix plot for the combined ride table to visualise where
# values are missing in each column.
msno.matrix(bike_data_2022)
# plt.title acts on pyplot's *current* axes, i.e. whichever axes is active
# after the msno.matrix call above.
# NOTE(review): confirm the title lands on the intended axes.
plt.title("The missing data matrix")
# Render the figure.
plt.show()
Percentage of null values in the Data Frame
Run cancelled
# Per-column tallies for the combined table: cells that are missing versus
# cells that hold a value (non-null).
null_counts = bike_data_2022.isnull().sum()
non_null_counts = bike_data_2022.count()

# Build the figure one trace at a time; the non-null trace is added first,
# matching the order in which the bars are stacked and listed in the legend.
fig = go.Figure()
fig.add_trace(
    go.Bar(name='Non null values', x=non_null_counts.index, y=non_null_counts.values)
)
fig.add_trace(
    go.Bar(name='Missing values', x=null_counts.index, y=null_counts.values)
)

# Apply all layout settings in one call: stacked bars, axis labels, title,
# font sizes, figure height and the seaborn template.
layout_options = dict(
    barmode='stack',
    xaxis_title="Column",
    yaxis_title="Count",
    title="Number of Rows with Non-Null and Missing Values per Column",
    title_font=dict(size=20),
    font=dict(size=14),
    height=500,
    template='seaborn',
)
fig.update_layout(**layout_options)

# Display the chart.
fig.show()