pandas 2.0 vs polars
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner

    Pandas 2.0 vs polars performance testing for data manipulation

    Install and import libraries

    %%capture
    !pip install polars
    !pip install pandas==2.0.0
    Hidden output
    import numpy as np
    import polars as pl
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    import time
    from string import ascii_letters
    import random
    
    sns.set()
    

    Create synthetic dataset

    def create_table(n):
        np.random.seed(42)
        # Create dataframe with sales information
        df = pd.DataFrame({'id' : [''.join(random.choice(ascii_letters) for x in range(10)) for _ in range(n)],
                           'date' :pd.date_range(start='1980-01-01', periods=n, freq='T'),
                           'office' : np.random.choice(['United States',
                                                        'Brasil','Spain',
                                                        'France','China',
                                                        'United Kingdom',
                                                        'Italy','Canada',
                                                        'India','Argentina'],size=n).astype(str),
                           'sales' : np.random.randint(0,10000, size= n).astype(np.int32),
                          'revenue' : np.random.randint(0,10000, size= n).astype(np.int32)})
        
        # Add some random nan values
       
        cols_list = df.select_dtypes('number').columns.tolist()
            
        for col in df[cols_list]:
            df.loc[df.sample(frac=0.05).index, col] = np.nan
                        
        return df
    def create_table_2(n):
        # Fictional databse with transactions in Italy office during 2022
    
        np.random.seed(42)
        df = pd.DataFrame({'id' : italy_2022_id,
                           'year':2022,
                           'name' : [''.join(random.choice(ascii_letters) for x in range(4)) for _ in range(n)],
                           'surname' : [''.join(random.choice(ascii_letters) for x in range(6)) for _ in range(n)],
                           'responsibility' : np.random.choice(['Sales Director', 
                                                                'Sales Manager',
                                                                'Sales Intern'],size=n).astype(str), 
                           'sex' :  np.random.choice(['male','female','non-binary'], size =n).astype(str)})
                           
        return df
        
    
    df = create_table(22616640)
    #df.to_csv('example.csv',index=False)
     
    Run cancelled
    italy_2022_id = df_pd.query('office == "Italy" and date.dt.year == 2022')['id'].values
    italy_2022 = create_table_2(52568)
    italy_2022_pl = pl.DataFrame(italy_2022)

    Plotting functions

    def reading_comparison(pd_time, pd_pyarrow_time, pl_time, title):
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))
        times = [pd_time, pd_pyarrow_time, pl_time]
        times = np.round(times, 2)
    
        sns.barplot(x = ['pandas (numpy)','pandas (pyarrow)','polars'], 
                    y = times, 
                    edgecolor='black')
    
    
        ax.set_title(f"{title} Test")
        ax.set_ylabel("Running time (seconds)")
        ax.bar_label(ax.containers[0])
        plt.savefig(f'./pandas_vs_polars_{title}.png', transparent=False,  facecolor='white', bbox_inches="tight")
        plt.show()
    def plot_comparison(pd_time, pl_time, title):
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))
        times = [pd_time, pl_time]
        times = np.round(times, 2)
    
        sns.barplot(x = ['pandas','polars'], 
                    y = times, 
                    edgecolor='black')
    
    
        ax.set_title(f"{title} Test")
        ax.set_ylabel("Running time (seconds)")
        ax.bar_label(ax.containers[0])
        plt.savefig(f'./pandas_vs_polars_{title}.png', transparent=False,  facecolor='white', bbox_inches="tight")
        plt.show()

    Reading data test

    s = time.time()
    df_pd = pd.read_csv("./example.csv")
    df_pd = df_pd[['id', 'date', 'office', 'sales']]
    df_pd.query('office =="France"')
    e = time.time()
    pd_time= e-s
    print("pandas Loading Time = {}".format(pd_time))
    s = time.time()
    df_pd_arrow = pd.read_csv("./example.csv", engine="pyarrow")
    df_pd_arrow= df_pd_arrow[['id', 'date', 'office', 'sales']]
    df_pd_arrow.query('office == "France"')
    e = time.time()
    pd_pyarrow_time= e-s
    
    print("pandas pyarrow Loading Time = {}".format(pd_pyarrow_time))