Skip to content
Ciclystic_Proyect
  • AI Chat
  • Code
  • Report
  • Importing Libraries

    # Importing Data Science Libraries
    import pandas as pd
    import numpy as np
    
    # Importing Visualization Libraries
    import missingno as msno
    import matplotlib.pyplot as plt
    from geopy.distance import geodesic
    import matplotlib.pyplot as plt
    import plotly.graph_objects as go
    import seaborn as sns

    Import Dataset from Drive

    # Twelve CSV exports named bike01_2022 ... bike12_2022 on the mounted Drive
    # (presumably one file per calendar month of 2022 - confirm against the data source).
    monthly_csv_paths = [
        "/content/drive/MyDrive/DataSet_Ciclystic/bike01_2022.csv",
        "/content/drive/MyDrive/DataSet_Ciclystic/bike02_2022.csv",
        "/content/drive/MyDrive/DataSet_Ciclystic/bike03_2022.csv",
        "/content/drive/MyDrive/DataSet_Ciclystic/bike04_2022.csv",
        "/content/drive/MyDrive/DataSet_Ciclystic/bike05_2022.csv",
        "/content/drive/MyDrive/DataSet_Ciclystic/bike06_2022.csv",
        "/content/drive/MyDrive/DataSet_Ciclystic/bike07_2022.csv",
        "/content/drive/MyDrive/DataSet_Ciclystic/bike08_2022.csv",
        "/content/drive/MyDrive/DataSet_Ciclystic/bike09_2022.csv",
        "/content/drive/MyDrive/DataSet_Ciclystic/bike10_2022.csv",
        "/content/drive/MyDrive/DataSet_Ciclystic/bike11_2022.csv",
        "/content/drive/MyDrive/DataSet_Ciclystic/bike12_2022.csv",
    ]

    # Read the files in list order and bind each resulting DataFrame to the
    # file-specific name that the following cells refer to.
    (
        bike01_2022, bike02_2022, bike03_2022, bike04_2022,
        bike05_2022, bike06_2022, bike07_2022, bike08_2022,
        bike09_2022, bike10_2022, bike11_2022, bike12_2022,
    ) = map(pd.read_csv, monthly_csv_paths)

    Concatenate all tables

    Run cancelled
    # Collect the individual DataFrames, in file order, so they can be
    # validated and then combined in a single step.
    dataframes_list = [bike01_2022, bike02_2022, bike03_2022, bike04_2022, bike05_2022, bike06_2022,
                       bike07_2022, bike08_2022, bike09_2022, bike10_2022, bike11_2022, bike12_2022]

    # The first DataFrame's set of column names is the reference schema.
    columnas_primer_df = set(dataframes_list[0].columns)

    # Warn (without aborting) about every DataFrame whose column set differs
    # from the reference. Numbering starts at 2 because DataFrame 1 is the
    # reference itself. Column order is not compared, only membership.
    for i, df in enumerate(dataframes_list[1:], start=2):
        if set(df.columns) != columnas_primer_df:
            print(f"Warning: The columns of the DataFrame {i} do not match those of the first DataFrame.")

    # Stack all DataFrames row-wise. ignore_index=True gives the result a
    # fresh 0..n-1 index; without it each input keeps its own row labels, so
    # the same label would appear once per file and label-based lookups on
    # the combined frame would return several rows.
    bike_data_2022 = pd.concat(dataframes_list, ignore_index=True)
    Run cancelled
    # Inspect the column labels of the combined DataFrame
    # (the value is shown as cell output when run in a notebook).
    bike_data_2022.columns
    Run cancelled
    # Preview the first 20 rows of the combined DataFrame.
    bike_data_2022.head(20)
    Run cancelled
    # Print a summary of the DataFrame: column dtypes, non-null counts and memory usage.
    bike_data_2022.info()
    Run cancelled
    # Parse both trip timestamp columns into pandas datetimes so they can be
    # subtracted from each other.
    for _timestamp_col in ('ended_at', 'started_at'):
        bike_data_2022[_timestamp_col] = pd.to_datetime(bike_data_2022[_timestamp_col])

    # Trip duration = end time minus start time (a timedelta per row).
    # NOTE: the column name 'ride_lenght' (sic) is kept as spelled because it is
    # runtime text that other cells may reference.
    bike_data_2022['ride_lenght'] = bike_data_2022['ended_at'].sub(bike_data_2022['started_at'])

    # Show the two timestamps next to the derived duration as a sanity check.
    _duration_preview_cols = ['started_at', 'ended_at', 'ride_lenght']
    print(bike_data_2022[_duration_preview_cols])
    Run cancelled
    # Make sure 'started_at' holds datetimes so the .dt accessor is available.
    # Re-applying pd.to_datetime to an already-converted column leaves its
    # values unchanged, so this cell can also be run on its own.
    bike_data_2022['started_at'] = bike_data_2022['started_at'].pipe(pd.to_datetime)

    # Derive the weekday name (e.g. 'Monday') of each trip's start time.
    _weekday_names = bike_data_2022['started_at'].dt.day_name()
    bike_data_2022['day_of_week'] = _weekday_names

    # Show the start time next to its weekday name as a sanity check.
    _weekday_preview_cols = ['started_at', 'day_of_week']
    print(bike_data_2022[_weekday_preview_cols])

    Distribution of null values in the Data Frame

    Run cancelled
    # Draw missingno's nullity matrix for the combined DataFrame to visualise
    # where values are missing in each column.
    msno.matrix(bike_data_2022)
    # plt.title acts on pyplot's current axes, i.e. whichever axes msno.matrix
    # left active. NOTE(review): confirm the title lands on the intended axes.
    plt.title("The missing data matrix")
    # Render the figure.
    plt.show()

    Percentage of null values in the Data Frame

    Run cancelled
    # Uses plotly.graph_objects (imported as `go` at the top of the file).

    # Per-column tallies: number of non-null entries and number of null entries.
    non_null_counts = bike_data_2022.notna().sum()
    null_counts = bike_data_2022.isna().sum()

    # One bar trace per tally, with the column names on the x axis.
    _stacked_bars = [
        go.Bar(name='Non null values', x=non_null_counts.index, y=non_null_counts.values),
        go.Bar(name='Missing values', x=null_counts.index, y=null_counts.values),
    ]
    fig = go.Figure(data=_stacked_bars)

    # Stack the two traces on top of each other and set the axis labels,
    # title, fonts, size and theme in a single layout update.
    fig.update_layout(
        barmode='stack',
        xaxis_title="Column",
        yaxis_title="Count",
        title="Number of Rows with Non-Null and Missing Values per Column",
        title_font=dict(size=20),
        font=dict(size=14),
        height=500,
        template='seaborn',
    )

    # Display the chart.
    fig.show()