Data Manipulation with pandas
Run the hidden code cell below to import the data used in this course.
# Import the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Import the four datasets (named to match how they are used in the notes below)
avocados = pd.read_csv("datasets/avocado.csv")
homelessness = pd.read_csv("datasets/homelessness.csv")
temperatures = pd.read_csv("datasets/temperatures.csv")
sales = pd.read_csv("datasets/walmart.csv")
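Optionally, the date columns can be parsed as datetimes at read time, which the .dt accessor and partial-date slicing later rely on (a sketch, assuming each file's date column is literally named "date"):
temperatures = pd.read_csv("datasets/temperatures.csv", parse_dates=["date"])
sales = pd.read_csv("datasets/walmart.csv", parse_dates=["date"])
avocados = pd.read_csv("datasets/avocado.csv", parse_dates=["date"])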
Some basic methods
Import pandas using the alias pd
import pandas as pd
print(homelessness.head())
print(homelessness.shape)
print(homelessness.describe())
print(homelessness.info())
Print the values of homelessness
print(homelessness.values)
Print the column index of homelessness
print(homelessness.columns)
Print the row index of homelessness
print(homelessness.index)
Sorting and subsetting data
Sort homelessness by individuals
homelessness_ind = homelessness.sort_values("individuals", ascending=True)
Print the top few rows
print(homelessness_ind.head())
Sort homelessness by descending family members
homelessness_fam = homelessness.sort_values("family_members", ascending = False)
Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(['region', 'family_members'], ascending = [True, False])
Select only the individuals and state columns, in that order
ind_state = homelessness[["individuals", "state"]]
Filter for rows where individuals is greater than 10000
ind_gt_10k = homelessness[homelessness.individuals > 10000]
Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness.region == "Mountain"]
Filter for rows where family_members is less than 1000 and region is Pacific
fam_lt_1k_pac = homelessness[(homelessness.family_members < 1000)&(homelessness.region == 'Pacific')]
Subset for rows in South Atlantic or Mid-Atlantic regions
south_mid_atlantic = homelessness[homelessness.region.isin(["South Atlantic", "Mid-Atlantic"])]
The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]
Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness["state"].isin(canu)]
New columns
Add total col as sum of individuals and family_members
homelessness["total"] = homelessness.individuals + homelessness.family_members
Add p_individuals col as proportion of total that are individuals
homelessness["p_individuals"] = homelessness.individuals / homelessness.total
Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness.individuals / homelessness.state_pop
Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness.indiv_per_10k > 20]
Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k", ascending = False)
From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[['state', 'indiv_per_10k']]
See the result
print(result)
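As a compact alternative, the same result can be built as one method chain (a sketch equivalent to the steps above):
result = (
    homelessness
    .assign(indiv_per_10k=lambda df: 10000 * df["individuals"] / df["state_pop"])
    .query("indiv_per_10k > 20")
    .sort_values("indiv_per_10k", ascending=False)
    [["state", "indiv_per_10k"]]
)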
Basic statistics
Print the head of the sales DataFrame
print(sales.head())
Print the info about the sales DataFrame
print(sales.info())
Print the mean of weekly_sales
print(sales.weekly_sales.mean())
Print the median of weekly_sales
print(sales.weekly_sales.median())
Print the maximum of the date column
print(sales["date"].max())
Print the minimum of the date column
print(sales["date"].min())
A custom IQR function
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)
Print IQR of the temperature_c column
print(sales['temperature_c'].agg(iqr))
Update to print IQR of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr))
Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median]))
########################################################################################
Sort sales_1_1 by date
sales_1_1 = sales_1_1.sort_values("date")
Get the cumulative sum of weekly_sales, add as cum_weekly_sales col
sales_1_1["cum_weekly_sales"] = sales_1_1["weekly_sales"].cumsum()
Get the cumulative max of weekly_sales, add as cum_max_sales col
sales_1_1["cum_max_sales"] = sales_1_1["weekly_sales"].cummax()
See the columns you calculated
print(sales_1_1[["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]])
Drop duplicates
Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store", "type"])
print(store_types.head())
Drop duplicate store/department combinations
store_depts = sales.drop_duplicates(subset=["store", "department"])
print(store_depts.head())
Subset the rows where is_holiday is True and drop duplicate dates
holiday_dates = sales[sales['is_holiday'] == True].drop_duplicates("date")
Print date col of holiday_dates
print(holiday_dates["date"])
Counting
Count the number of stores of each type
store_counts = store_types["type"].value_counts()
print(store_counts)
Get the proportion of stores of each type
store_props = store_types['type'].value_counts(normalize = True)
print(store_props)
Count the number of each department number and sort
dept_counts_sorted = store_depts['department'].value_counts(sort = True)
print(dept_counts_sorted)
Get the proportion of departments of each number and sort
dept_props_sorted = store_depts["department"].value_counts(sort= True, normalize=True)
print(dept_props_sorted)
Grouped summary statistics
Calc total weekly sales
sales_all = sales["weekly_sales"].sum()
Subset for type A stores, calc total weekly sales
sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()
Subset for type B stores, calc total weekly sales
sales_B = sales[sales.type == "B"]["weekly_sales"].sum()
Subset for type C stores, calc total weekly sales
sales_C = sales[sales.type == "C"]["weekly_sales"].sum()
Get proportion for each type
sales_propn_by_type = [sales_A, sales_B, sales_C] / sales_all  # the division broadcasts because sales_all is a NumPy scalar
print(sales_propn_by_type)
#########################################################################
Group by type; calc total weekly sales
sales_by_type = sales.groupby("type")["weekly_sales"].sum()
Get proportion for each type
sales_propn_by_type = sales_by_type / sum(sales_by_type)
print(sales_propn_by_type)
Group by type and is_holiday; calc total weekly sales
sales_by_type_is_holiday = sales.groupby(["type", "is_holiday"])["weekly_sales"].agg("sum")
print(sales_by_type_is_holiday)
########################################################################
For each store type, aggregate weekly_sales: get min, max, mean, and median
sales_stats = sales.groupby("type")["weekly_sales"].agg(["min", "max", "mean", "median"])
Print sales_stats
print(sales_stats)
For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg(["min", "max", "mean", "median"])
Print unemp_fuel_stats
print(unemp_fuel_stats)
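Newer pandas also supports named aggregation, which labels the output columns directly; a sketch of the same min/max/mean/median summary (the output names min_sales etc. are made up here):
sales_stats_named = sales.groupby("type")["weekly_sales"].agg(
    min_sales="min", max_sales="max", mean_sales="mean", median_sales="median")
print(sales_stats_named)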
Pivot table
Pivot for mean weekly_sales for each store type
mean_sales_by_type = sales.pivot_table(values = "weekly_sales", index = 'type')
Pivot for mean and median weekly_sales for each store type
mean_med_sales_by_type = sales.pivot_table(values="weekly_sales", index="type", aggfunc=["mean", "median"])
Pivot for mean weekly_sales by store type and holiday
mean_sales_by_type_holiday = sales.pivot_table(values = "weekly_sales", index = "type", columns = "is_holiday")
Print mean weekly_sales by department and type; fill missing values with 0
print(sales.pivot_table(values = "weekly_sales", index = 'type', columns = "department", fill_value = 0))
Print the mean weekly_sales by department and type; fill missing values with 0s; sum all rows and cols
print(sales.pivot_table(values="weekly_sales", index="department", columns="type", margins = True, fill_value = 0))
Explicit indexes
Set the index of temperatures to city
temperatures_ind = temperatures.set_index("city")
Reset the temperatures_ind index, keeping its contents
print(temperatures_ind.reset_index())
Reset the temperatures_ind index, dropping its contents
print(temperatures_ind.reset_index(drop = True))
########################################################################################
Make a list of cities to subset on
cities = ["Moscow", "Saint Petersburg"]
Subset temperatures using square brackets
print(temperatures[temperatures["city"].isin(cities)])
Subset temperatures_ind using .loc[]
print(temperatures_ind.loc[cities])
#######################################################################################
Index temperatures by country & city
temperatures_ind = temperatures.set_index(["country", 'city'])
List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [("Brazil", "Rio De Janeiro"), ("Pakistan", "Lahore")]
Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])
######################################################################################
Sort temperatures_ind by index values
print(temperatures_ind.sort_index())
Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level = "city"))
Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level = ["country", "city"], ascending = [True, False]))
Slicing and subsetting with .loc[] and .iloc[]
Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index()
Subset rows from Pakistan to Russia
print(temperatures_srt.loc["Pakistan":"Russia"])
Try to subset rows from Lahore to Moscow
print(temperatures_srt.loc["Lahore":"Moscow"])  # returns nonsense: string slices target the outer level (country), not city
Subset rows from Pakistan, Lahore to Russia, Moscow
print(temperatures_srt.loc[("Pakistan", 'Lahore'): ('Russia', "Moscow")])
Subset rows from India, Hyderabad to Iraq, Baghdad
print(temperatures_srt.loc[('India', 'Hyderabad') : ('Iraq', 'Baghdad')])
Subset columns from date to avg_temp_c
print(temperatures_srt.loc[ : , 'date' : 'avg_temp_c'])
Subset in both directions at once
print(temperatures_srt.loc[('India', 'Hyderabad') : ('Iraq', 'Baghdad'), 'date' : 'avg_temp_c'])
Use Boolean conditions to subset temperatures for rows in 2010 and 2011
temperatures_bool = temperatures[(temperatures.date >= "2010-01-01") & (temperatures.date <= "2011-12-31")]
print(temperatures_bool)
Set date as the index and sort the index
temperatures_ind = temperatures.set_index("date").sort_index()
Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
print(temperatures_ind.loc["2010" : "2011"])
Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc["2010-08" : "2011-02"])
Get 23rd row, 2nd column (index 22, 1)
print(temperatures.iloc[22, 1])
Use slicing to get the first 5 rows
print(temperatures.iloc[: 5])
Use slicing to get columns 3 to 4
print(temperatures.iloc[:, 2:4])
Use slicing in both directions at once
print(temperatures.iloc[ : 5, 2:4])
Slicing with pivot tables
Add a year column to temperatures, converting date to datetime in case it was read as strings
temperatures["year"] = pd.to_datetime(temperatures["date"]).dt.year
Pivot avg_temp_c by country and city vs year
temp_by_country_city_vs_year = temperatures.pivot_table("avg_temp_c", index = ["country", 'city'], columns = "year")
See the result
print(temp_by_country_city_vs_year)
Subset for Egypt to India
print(temp_by_country_city_vs_year.loc['Egypt': 'India'])
Subset for Egypt, Cairo to India, Delhi
print(temp_by_country_city_vs_year.loc[("Egypt", "Cairo") : ('India', 'Delhi')])
Subset for Egypt, Cairo to India, Delhi, and 2005 to 2010
print(temp_by_country_city_vs_year.loc[("Egypt", "Cairo") : ('India', 'Delhi'), 2005 : 2010])
Get the worldwide mean temp by year
mean_temp_by_year = temp_by_country_city_vs_year.mean()
Filter for the year that had the highest mean temp
print(mean_temp_by_year[mean_temp_by_year == mean_temp_by_year.max()])
Get the mean temp by city
mean_temp_by_city = temp_by_country_city_vs_year.mean(axis="columns")
Filter for the city that had the lowest mean temp
print(mean_temp_by_city[mean_temp_by_city == mean_temp_by_city.min()])
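The boolean filters above work, but .idxmax() and .idxmin() return the matching label directly:
print(mean_temp_by_year.idxmax())  # year with the highest mean temperature
print(mean_temp_by_city.idxmin())  # city with the lowest mean temperature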
Visualization
Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
Look at the first few rows of data
print(avocados.head())
Get the total number of avocados sold of each size
nb_sold_by_size = avocados.groupby("size")['nb_sold'].sum()
Create a bar plot of the number of avocados sold by size
nb_sold_by_size.plot(kind = "bar")
Show the plot
plt.show()
#####################################################################
Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
Get the total number of avocados sold on each date
nb_sold_by_date = avocados.groupby("date")['nb_sold'].sum()
Create a line plot of the number of avocados sold by date
nb_sold_by_date.plot(kind = 'line')
Show the plot
plt.show()
#########################################################################
Scatter plot of avg_price vs. nb_sold with title
avocados.plot(kind = "scatter", x = "nb_sold", y = 'avg_price', title = 'Number of avocados sold vs. average price' )
Show the plot
plt.show()
###########################################################################
Histogram of conventional avg_price
avocados[avocados.type == 'conventional']['avg_price'].hist()
Histogram of organic avg_price
avocados[avocados.type == 'organic']['avg_price'].hist()
Add a legend
plt.legend(["Conventional", 'Organic'])
Show the plot
plt.show()
Modify histogram transparency to 0.5
avocados[avocados["type"] == "conventional"]["avg_price"].hist(alpha = 0.5)
Modify histogram transparency to 0.5
avocados[avocados["type"] == "organic"]["avg_price"].hist(alpha = 0.5)
Modify bins to 20
avocados[avocados["type"] == "conventional"]["avg_price"].hist(alpha=0.5, bins = 20)
Modify bins to 20
avocados[avocados["type"] == "organic"]["avg_price"].hist(alpha=0.5, bins = 20)