Skip to content
Pandas 2.0 vs polars performance testing for data manipulation
Install and import libraries
%%capture
!pip install polars
!pip install pandas==2.0.0
Hidden output
import numpy as np
import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
from string import ascii_letters
import random
sns.set()
Create synthetic dataset
def create_table(n):
np.random.seed(42)
# Create dataframe with sales information
df = pd.DataFrame({'id' : [''.join(random.choice(ascii_letters) for x in range(10)) for _ in range(n)],
'date' :pd.date_range(start='1980-01-01', periods=n, freq='T'),
'office' : np.random.choice(['United States',
'Brasil','Spain',
'France','China',
'United Kingdom',
'Italy','Canada',
'India','Argentina'],size=n).astype(str),
'sales' : np.random.randint(0,10000, size= n).astype(np.int32),
'revenue' : np.random.randint(0,10000, size= n).astype(np.int32)})
# Add some random nan values
cols_list = df.select_dtypes('number').columns.tolist()
for col in df[cols_list]:
df.loc[df.sample(frac=0.05).index, col] = np.nan
return df
def create_table_2(n):
# Fictional databse with transactions in Italy office during 2022
np.random.seed(42)
df = pd.DataFrame({'id' : italy_2022_id,
'year':2022,
'name' : [''.join(random.choice(ascii_letters) for x in range(4)) for _ in range(n)],
'surname' : [''.join(random.choice(ascii_letters) for x in range(6)) for _ in range(n)],
'responsibility' : np.random.choice(['Sales Director',
'Sales Manager',
'Sales Intern'],size=n).astype(str),
'sex' : np.random.choice(['male','female','non-binary'], size =n).astype(str)})
return df
df = create_table(22616640)
#df.to_csv('example.csv',index=False)
Run cancelled
italy_2022_id = df_pd.query('office == "Italy" and date.dt.year == 2022')['id'].values
italy_2022 = create_table_2(52568)
italy_2022_pl = pl.DataFrame(italy_2022)
Plotting functions
def reading_comparison(pd_time, pd_pyarrow_time, pl_time, title):
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))
times = [pd_time, pd_pyarrow_time, pl_time]
times = np.round(times, 2)
sns.barplot(x = ['pandas (numpy)','pandas (pyarrow)','polars'],
y = times,
edgecolor='black')
ax.set_title(f"{title} Test")
ax.set_ylabel("Running time (seconds)")
ax.bar_label(ax.containers[0])
plt.savefig(f'./pandas_vs_polars_{title}.png', transparent=False, facecolor='white', bbox_inches="tight")
plt.show()
def plot_comparison(pd_time, pl_time, title):
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))
times = [pd_time, pl_time]
times = np.round(times, 2)
sns.barplot(x = ['pandas','polars'],
y = times,
edgecolor='black')
ax.set_title(f"{title} Test")
ax.set_ylabel("Running time (seconds)")
ax.bar_label(ax.containers[0])
plt.savefig(f'./pandas_vs_polars_{title}.png', transparent=False, facecolor='white', bbox_inches="tight")
plt.show()
Reading data test
s = time.time()
df_pd = pd.read_csv("./example.csv")
df_pd = df_pd[['id', 'date', 'office', 'sales']]
df_pd.query('office =="France"')
e = time.time()
pd_time= e-s
print("pandas Loading Time = {}".format(pd_time))
s = time.time()
df_pd_arrow = pd.read_csv("./example.csv", engine="pyarrow")
df_pd_arrow= df_pd_arrow[['id', 'date', 'office', 'sales']]
df_pd_arrow.query('office == "France"')
e = time.time()
pd_pyarrow_time= e-s
print("pandas pyarrow Loading Time = {}".format(pd_pyarrow_time))