Cyclistic_Project

Importing Libraries

# Importing Data Science Libraries
import pandas as pd
import numpy as np

# Importing Visualization Libraries
import missingno as msno
import matplotlib.pyplot as plt
from geopy.distance import geodesic
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns

Import Dataset from Drive

# Load the twelve monthly 2022 files (bike01_2022.csv ... bike12_2022.csv) from
# the Drive folder, in calendar order, binding one DataFrame per month.
(
    bike01_2022, bike02_2022, bike03_2022, bike04_2022,
    bike05_2022, bike06_2022, bike07_2022, bike08_2022,
    bike09_2022, bike10_2022, bike11_2022, bike12_2022,
) = (
    pd.read_csv(f"/content/drive/MyDrive/DataSet_Ciclystic/bike{month:02d}_2022.csv")
    for month in range(1, 13)
)

Concatenate all tables

Run cancelled

# List of the twelve monthly DataFrames, in calendar order
dataframes_list = [bike01_2022, bike02_2022, bike03_2022, bike04_2022, bike05_2022, bike06_2022,
                   bike07_2022, bike08_2022, bike09_2022, bike10_2022, bike11_2022, bike12_2022]

# Check that every monthly DataFrame has the same set of column names as the
# first one. Sets are compared, so column order is deliberately ignored.
columnas_primer_df = set(dataframes_list[0].columns)

# start=2 so the warning reports the 1-based position of the offending DataFrame
for i, df in enumerate(dataframes_list[1:], start=2):
    if set(df.columns) != columnas_primer_df:
        print(f"Warning: The columns of the DataFrame {i} do not match those of the first DataFrame.")


# Concatenate all months into a single DataFrame.
# ignore_index=True relabels the rows 0..n-1; without it each month keeps its
# own row labels and the combined index contains duplicate labels, which makes
# label-based selection (.loc) ambiguous.
bike_data_2022 = pd.concat(dataframes_list, ignore_index=True)

Run cancelled

# Inspect the column labels of the combined DataFrame (as the last expression
# of a notebook cell, its value is what gets displayed).
bike_data_2022.columns

Run cancelled

# Preview the first 20 rows of the combined DataFrame.
bike_data_2022.head(20)

Run cancelled

# Print a summary of the combined DataFrame (columns, non-null counts, dtypes).
bike_data_2022.info()

Run cancelled

# Parse both trip timestamp columns into pandas datetime values
for time_col in ('ended_at', 'started_at'):
    bike_data_2022[time_col] = pd.to_datetime(bike_data_2022[time_col])

# Add the 'ride_lenght' column: 'ended_at' minus 'started_at' for every row
bike_data_2022['ride_lenght'] = bike_data_2022['ended_at'].sub(bike_data_2022['started_at'])

print(bike_data_2022[['started_at', 'ended_at', 'ride_lenght']])

Run cancelled

# Make sure 'started_at' holds datetime values and keep the parsed Series at hand
started_at_ts = pd.to_datetime(bike_data_2022['started_at'])
bike_data_2022['started_at'] = started_at_ts

# Add the 'day_of_week' column with the weekday name of each start timestamp
bike_data_2022['day_of_week'] = started_at_ts.dt.day_name()

print(bike_data_2022[['started_at', 'day_of_week']])

Distribution of null values in the Data Frame

Run cancelled

# Shows the missing data matrix
# msno.matrix draws the plot for the whole combined DataFrame; its return value
# is not captured here.
msno.matrix(bike_data_2022)
# plt.title acts on matplotlib's current axes, so this call must stay after
# msno.matrix and before plt.show().
plt.title("The missing data matrix")
plt.show()

Percentage of null values in the Data Frame

Run cancelled

# Per-column tallies: count() gives the non-null entries of each column,
# isna().sum() gives the null entries.
non_null_counts = bike_data_2022.count()
null_counts = bike_data_2022.isna().sum()

# One bar trace per tally, both laid out along the column names
present_bar = go.Bar(
    name='Non null values',
    x=non_null_counts.index,
    y=non_null_counts.values,
)
missing_bar = go.Bar(
    name='Missing values',
    x=null_counts.index,
    y=null_counts.values,
)

fig = go.Figure(data=[present_bar, missing_bar])

# Stack the two traces and set the axis labels, title, fonts, size and theme
fig.update_layout(
    barmode='stack',
    xaxis_title="Column",
    yaxis_title="Count",
    title="Number of Rows with Non-Null and Missing Values per Column",
    title_font=dict(size=20),
    font=dict(size=14),
    height=500,
    template='seaborn',
)

# Render the chart
fig.show()

‌
‌
‌