Skip to content

Data Manipulation with pandas

Run the hidden code cell below to import the data used in this course.

# Import the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import the four datasets
avocado = pd.read_csv("datasets/avocado.csv")
homelessness = pd.read_csv("datasets/homelessness.csv")
temperatures = pd.read_csv("datasets/temperatures.csv")
walmart = pd.read_csv("datasets/walmart.csv")

Some basic methods

Import pandas using the alias pd

import pandas as pd

print(homelessness.head())

print(homelessness.shape)

print(homelessness.describe)

print(homelessness.info())

Print the values of homelessness

print(homelessness.values)

Print the column index of homelessness

print(homelessness.columns)

Print the row index of homelessness

print(homelessness.index)

Sorting and subsetting data

Sort homelessness by individuals

# Sort by individuals, ascending.
# Bug fix: `ascending` expects a boolean, not the string "True" — any
# non-empty string is truthy, so the original only worked by accident
# (and ascending="False" would have silently sorted ascending too).
homelessness_ind = homelessness.sort_values("individuals", ascending=True)

Print the top few rows

print(homelessness_ind.head())

Sort homelessness by descending family members

homelessness_fam = homelessness.sort_values("family_members", ascending = False)

Sort homelessness by region, then descending family members

homelessness_reg_fam = homelessness.sort_values(['region', 'family_members'], ascending = [True, False])

Select only the individuals and state columns, in that order

ind_state = homelessness[["individuals", "state"]]

Filter for rows where individuals is greater than 10000

ind_gt_10k = homelessness[homelessness.individuals > 10000]

Filter for rows where region is Mountain

mountain_reg = homelessness[homelessness.region == "Mountain"]

Filter for rows where family_members is less than 1000 and region is Pacific

# Rows with fewer than 1000 family members in the Pacific region.
# Named masks make the compound condition self-documenting.
small_families = homelessness["family_members"] < 1000
in_pacific = homelessness["region"] == "Pacific"
fam_lt_1k_pac = homelessness[small_families & in_pacific]

Subset for rows in South Atlantic or Mid-Atlantic regions

south_mid_atlantic = homelessness[homelessness.region.isin(["South Atlantic", "Mid-Atlantic"])]

The Mojave Desert states

canu = ["California", "Arizona", "Nevada", "Utah"]

Filter for rows in the Mojave Desert states

mojave_homelessness = homelessness[homelessness["state"].isin(canu)]

New columns

Add total col as sum of individuals and family_members

homelessness["total"] = homelessness.individuals + homelessness.family_members

Add p_individuals col as proportion of total that are individuals

homelessness["p_individuals"] = homelessness.individuals / homelessness.total

Create indiv_per_10k col as homeless individuals per 10k state pop

homelessness["indiv_per_10k"] = 10000 * homelessness.individuals / homelessness.state_pop

Subset rows for indiv_per_10k greater than 20

high_homelessness = homelessness[homelessness.indiv_per_10k > 20]

Sort high_homelessness by descending indiv_per_10k

high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k", ascending = False)

From high_homelessness_srt, select the state and indiv_per_10k cols

result = high_homelessness_srt[['state', 'indiv_per_10k']]

See the result

print(result)

Basic statistics

Print the head of the sales DataFrame

print(sales.head())

Print the info about the sales DataFrame

print(sales.info())

Print the mean of weekly_sales

print(sales.weekly_sales.mean())

Print the median of weekly_sales

print(sales.weekly_sales.median())

Print the maximum of the date column

print(sales["date"].max())

Print the minimum of the date column

print(sales["date"].min())

A custom IQR function

def iqr(column):
    """Return the interquartile range (Q3 - Q1) of a numeric Series.

    Parameters
    ----------
    column : pd.Series
        Numeric data to summarize.

    Returns
    -------
    float
        The spread between the 75th and 25th percentiles.
    """
    return column.quantile(0.75) - column.quantile(0.25)

Print IQR of the temperature_c column

print(sales['temperature_c'].agg(iqr))

Update to print IQR of temperature_c, fuel_price_usd_per_l, & unemployment

print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))

Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment

# IQR and median of three columns at once.
# "median" (string) instead of np.median: passing numpy callables to .agg
# is deprecated in pandas >= 2.0 and emits a FutureWarning.
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, "median"]))

########################################################################################

Sort sales_1_1 by date

sales_1_1 = sales_1_1.sort_values("date")

Get the cumulative sum of weekly_sales, add as cum_weekly_sales col

sales_1_1["cum_weekly_sales"] = sales_1_1["weekly_sales"].cumsum()

Get the cumulative max of weekly_sales, add as cum_max_sales col

sales_1_1["cum_max_sales"] = sales_1_1["weekly_sales"].cummax()

See the columns you calculated

print(sales_1_1[["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]])

Drop duplicates

Drop duplicate store/type combinations

# Keep one row per store/type combination.
# Fix: the assignment and print were fused onto one physical line
# (a paste artifact), which is a SyntaxError in Python.
store_types = sales.drop_duplicates(["store", "type"])
print(store_types.head())

Drop duplicate store/department combinations

# Keep one row per store/department combination.
# Fix: the assignment and print were fused onto one physical line
# (a paste artifact), which is a SyntaxError in Python.
store_depts = sales.drop_duplicates(["store", "department"])
print(store_depts.head())

Subset the rows where is_holiday is True and drop duplicate dates

holiday_dates = sales[sales['is_holiday'] == True].drop_duplicates("date")

Print date col of holiday_dates

print(holiday_dates["date"])

Counting

Count the number of stores of each type

store_counts = store_types["type"].value_counts()

print(store_counts)

Get the proportion of stores of each type

store_props = store_types['type'].value_counts(normalize = True)

print(store_props)

Count the number of each department number and sort

dept_counts_sorted = store_depts['department'].value_counts(sort = True)

print(dept_counts_sorted)

Get the proportion of departments of each number and sort

dept_props_sorted = store_depts["department"].value_counts(sort= True, normalize=True)

print(dept_props_sorted)

Grouped summary statistics

Calc total weekly sales

sales_all = sales["weekly_sales"].sum()

Subset for type A stores, calc total weekly sales

sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()

Subset for type B stores, calc total weekly sales

sales_B = sales[sales.type == "B"]["weekly_sales"].sum()

Subset for type C stores, calc total weekly sales

sales_C = sales[sales.type == "C"]["weekly_sales"].sum()

Get proportion for each type

# Proportion of total weekly sales per store type.
# Bug fix: dividing a plain Python list by a scalar raises TypeError;
# wrap in np.array so the division broadcasts element-wise.
sales_propn_by_type = np.array([sales_A, sales_B, sales_C]) / sales_all

print(sales_propn_by_type)

#########################################################################

Group by type; calc total weekly sales

sales_by_type = sales.groupby("type")["weekly_sales"].sum()

Get proportion for each type

sales_propn_by_type = sales_by_type / sum(sales_by_type)

print(sales_propn_by_type)

Group by type and is_holiday; calc total weekly sales

# Total weekly sales per (type, is_holiday) group.
# "sum" (string) instead of the builtin sum: passing builtins/callables
# to .agg is deprecated in recent pandas in favor of named aggregations.
sales_by_type_is_holiday = sales.groupby(["type", "is_holiday"])["weekly_sales"].agg("sum")

print(sales_by_type_is_holiday) ########################################################################

For each store type, aggregate weekly_sales: get min, max, mean, and median

# Min, max, mean, and median weekly sales per store type.
# String names instead of np.min/np.max/np.mean/np.median: passing
# numpy callables to .agg is deprecated in pandas >= 2.0.
sales_stats = sales.groupby("type")["weekly_sales"].agg(["min", "max", "mean", "median"])

Print sales_stats

print(sales_stats)

For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median

# Same four statistics for unemployment and fuel price per store type.
# Bug fix: tuple-style column selection after groupby
# (groupby(...)["a", "b"]) was deprecated and removed in pandas 2.0 —
# a list inside a second pair of brackets is required. Also switched
# the deprecated numpy callables to string aggregation names.
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg(["min", "max", "mean", "median"])

Print unemp_fuel_stats

print(unemp_fuel_stats)

Pivot table

Pivot for mean weekly_sales for each store type

mean_sales_by_type = sales.pivot_table(values = "weekly_sales", index = 'type')

Pivot for mean and median weekly_sales for each store type

# Mean and median weekly sales per store type via pivot_table.
# String names instead of np.mean/np.median: numpy callables in
# aggfunc are deprecated in pandas >= 2.0.
mean_med_sales_by_type = sales.pivot_table(values="weekly_sales", index="type", aggfunc=["mean", "median"])

Pivot for mean weekly_sales by store type and holiday

mean_sales_by_type_holiday = sales.pivot_table(values = "weekly_sales", index = "type", columns = "is_holiday")

Print mean weekly_sales by department and type; fill missing values with 0

print(sales.pivot_table(values = "weekly_sales", index = 'type', columns = "department", fill_value = 0))

Print the mean weekly_sales by department and type; fill missing values with 0s; sum all rows and cols

print(sales.pivot_table(values="weekly_sales", index="department", columns="type", margins = True, fill_value = 0))

Explicit indexes

Set the index of temperatures to city

temperatures_ind = temperatures.set_index("city")

Reset the temperatures_ind index, keeping its contents

print(temperatures_ind.reset_index())

Reset the temperatures_ind index, dropping its contents

print(temperatures_ind.reset_index(drop = True))

########################################################################################

Make a list of cities to subset on

cities = ["Moscow", "Saint Petersburg"]

Subset temperatures using square brackets

print(temperatures[temperatures["city"].isin(cities)])

Subset temperatures_ind using .loc[]

print(temperatures_ind.loc[cities])

#######################################################################################

Index temperatures by country & city

temperatures_ind = temperatures.set_index(["country", 'city'])

List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore

# MultiIndex row labels to keep.
# Bug fix: the prose above says "list of tuples" but the original built a
# list of LISTS — .loc on a MultiIndex needs tuples to address
# (country, city) pairs; lists are interpreted as per-level selections.
rows_to_keep = [("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")]

Subset for rows to keep

print(temperatures_ind.loc[rows_to_keep])

######################################################################################

Sort temperatures_ind by index values

print(temperatures_ind.sort_index())

Sort temperatures_ind by index values at the city level

print(temperatures_ind.sort_index(level = "city"))

Sort temperatures_ind by country then descending city

print(temperatures_ind.sort_index(level = ["country", "city"], ascending = [True, False]))

Slicing and subsetting with .loc and .iloc

Sort the index of temperatures_ind

temperatures_srt = temperatures_ind.sort_index()

Subset rows from Pakistan to Russia

print(temperatures_srt.loc["Pakistan":"Russia"])

Try to subset rows from Lahore to Moscow

# Deliberately wrong demo: slicing the OUTER index level (country) with
# inner-level city names compares "Lahore"/"Moscow" against country
# labels, so the result is not the Lahore-to-Moscow range you'd expect.
print(temperatures_srt.loc['Lahore': 'Moscow'])  # wrong index level — meaningless slice

Subset rows from Pakistan, Lahore to Russia, Moscow

print(temperatures_srt.loc[("Pakistan", 'Lahore'): ('Russia', "Moscow")])

Subset rows from India, Hyderabad to Iraq, Baghdad

print(temperatures_srt.loc[('India', 'Hyderabad') : ('Iraq', 'Baghdad')])

Subset columns from date to avg_temp_c

print(temperatures_srt.loc[ : , 'date' : 'avg_temp_c'])

Subset in both directions at once

print(temperatures_srt.loc[('India', 'Hyderabad') : ('Iraq', 'Baghdad'), 'date' : 'avg_temp_c'])

Use Boolean conditions to subset temperatures for rows in 2010 and 2011

# Rows whose date falls in 2010 or 2011 (string comparison on the date column).
# Fix: the assignment and print were fused onto one physical line
# (a paste artifact), which is a SyntaxError in Python.
temperatures_bool = temperatures[(temperatures.date >= "2010-01-01") & (temperatures.date <= "2011-12-31")]
print(temperatures_bool)

Set date as the index and sort the index

temperatures_ind = temperatures.set_index("date").sort_index()

Use .loc[] to subset temperatures_ind for rows in 2010 and 2011

print(temperatures_ind.loc["2010" : "2011"])

Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011

print(temperatures_ind.loc["2010-08" : "2011-02"])

Get 23rd row, 2nd column (index 22, 1)

print(temperatures.iloc[22, 1])

Use slicing to get the first 5 rows

print(temperatures.iloc[: 5])

Use slicing to get columns 3 to 4

print(temperatures.iloc[:, 2:4])

Use slicing in both directions at once

print(temperatures.iloc[ : 5, 2:4])

Slicing with pivot tables

Add a year column to temperatures

# Extract the calendar year from the date column.
# NOTE(review): .dt.year requires "date" to be datetime64 — read_csv loads
# it as str unless parse_dates was used; confirm the column was converted
# upstream before running this.
temperatures["year"] = temperatures.date.dt.year

Pivot avg_temp_c by country and city vs year

temp_by_country_city_vs_year = temperatures.pivot_table("avg_temp_c", index = ["country", 'city'], columns = "year")

See the result

print(temp_by_country_city_vs_year)

Subset for Egypt to India

print(temp_by_country_city_vs_year.loc['Egypt': 'India'])

Subset for Egypt, Cairo to India, Delhi

print(temp_by_country_city_vs_year.loc[("Egypt", "Cairo") : ('India', 'Delhi')])

Subset for Egypt, Cairo to India, Delhi, and 2005 to 2010

print(temp_by_country_city_vs_year.loc[("Egypt", "Cairo") : ('India', 'Delhi'), 2005 : 2010])

Get the worldwide mean temp by year

mean_temp_by_year = temp_by_country_city_vs_year.mean()

Filter for the year that had the highest mean temp

print(mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()])

Get the mean temp by city

mean_temp_by_city = temp_by_country_city_vs_year.mean(axis="columns")

Filter for the city that had the lowest mean temp

print(mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()])

Visualization

Import matplotlib.pyplot with alias plt

import matplotlib.pyplot as plt

Look at the first few rows of data

print(avocados.head())

Get the total number of avocados sold of each size

# Total avocados sold per size category.
# NOTE(review): the dataset was loaded as `avocado` (singular) at the top
# of the file — `avocados` is presumably defined/renamed in the course
# environment; confirm before running, or this raises NameError.
nb_sold_by_size = avocados.groupby("size")['nb_sold'].sum()

Create a bar plot of the number of avocados sold by size

nb_sold_by_size.plot(kind = "bar")

Show the plot

plt.show()

#####################################################################

Import matplotlib.pyplot with alias plt

import matplotlib.pyplot as plt

Get the total number of avocados sold on each date

nb_sold_by_date = avocados.groupby("date")['nb_sold'].sum()

Create a line plot of the number of avocados sold by date

nb_sold_by_date.plot(kind = 'line')

Show the plot

plt.show()

#########################################################################

Scatter plot of avg_price vs. nb_sold with title

avocados.plot(kind = "scatter", x = "nb_sold", y = 'avg_price', title = 'Number of avocados sold vs. average price' )

Show the plot

plt.show()

###########################################################################

Histogram of conventional avg_price

avocados[avocados.type == 'conventional']['avg_price'].hist()

Histogram of organic avg_price

avocados[avocados.type == 'organic']['avg_price'].hist()

Add a legend

plt.legend(["Conventional", 'Organic'])

Show the plot

plt.show()

Modify histogram transparency to 0.5

avocados[avocados["type"] == "conventional"]["avg_price"].hist(alpha = 0.5)

Modify histogram transparency to 0.5

avocados[avocados["type"] == "organic"]["avg_price"].hist(alpha = 0.5)

Modify bins to 20

avocados[avocados["type"] == "conventional"]["avg_price"].hist(alpha=0.5, bins = 20)

Modify bins to 20

avocados[avocados["type"] == "organic"]["avg_price"].hist(alpha=0.5, bins = 20)