Data Manipulation with pandas
Run the hidden code cell below to import the data used in this course.
# Import the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Import the four datasets
avocado = pd.read_csv("datasets/avocado.csv")
homelessness = pd.read_csv("datasets/homelessness.csv")
temperatures = pd.read_csv("datasets/temperatures.csv")
sales = pd.read_csv("datasets/walmart.csv")
Filtering and Sorting
# Sorting rows by column values
homelessness_ind = homelessness.sort_values("individuals", ascending=False)  # ascending=False sorts in descending order
homelessness_fam = homelessness.sort_values(["individuals", "family_members"], ascending=[True, False])  # multiple sort columns; the list order matters
print(homelessness_fam.head())
## Filter method
# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness['region'] == "Mountain"]
# See the result
print(mountain_reg)
## More complex syntax
# Filter for rows where family_members is less than 1000 and region is Pacific
fam_lt_1k_pac = homelessness[(homelessness["family_members"] < 1000) & (homelessness["region"] == "Pacific")]
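Several categories can be matched at once with .isin(). A minimal sketch; the region names below are assumed to exist in the data:
# Filter for rows whose region is in a list of values
south_mid_atlantic = homelessness[homelessness["region"].isin(["South Atlantic", "Mid-Atlantic"])]
print(south_mid_atlantic.head())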
New Columns
## Creating new columns from an old one
homelessness['individuals_per1k'] = homelessness['individuals'] / 1000
homelessness
### Multiple Manipulation ####################
# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] / homelessness["state_pop"]
# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"] > 20]
# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k", ascending=False)
# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state", "indiv_per_10k"]]
# See the result
print(result)
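The same four steps can also be chained into a single expression; a sketch using only the columns introduced above:
# Add column, filter, sort, and select in one chained pipeline
result_chained = (
    homelessness
    .assign(indiv_per_10k=10000 * homelessness["individuals"] / homelessness["state_pop"])
    .query("indiv_per_10k > 20")
    .sort_values("indiv_per_10k", ascending=False)
    [["state", "indiv_per_10k"]]
)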
Summary Statistics
# Statistics: .median(), .mode(), .var(), .min(), .max(), .std(), .sum(), .quantile()
sales["date"].min() # nos entrega el perro mas joben
# While pandas and NumPy have tons of functions, sometimes you may need a different function to summarize your data.
# A custom IQR function; like the standard deviation, it measures spread
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)
# Use the .agg() method to apply the custom function
# Print IQR of the temperature_c column
print(sales["temperature_c"].agg(iqr))
# Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, "median"]))  # add a second aggregation; built-in statistics can be requested by name
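The same custom statistic can be passed inline as a lambda, without defining a named function; a minimal sketch:
# Inline equivalent of the iqr function above
print(sales["temperature_c"].agg(lambda col: col.quantile(0.75) - col.quantile(0.25)))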
Counting
# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store", "type"])  # keeps only the unique store/type combinations
print(store_types.head())
# Subset the rows where is_holiday is True and drop duplicate dates
holiday_dates = sales[sales["is_holiday"]].drop_duplicates(subset="date")  # same idea, but only for rows where is_holiday is True
# Print date col of holiday_dates
print(holiday_dates["date"])
## More Complex
# Count the number of stores of each type
store_counts = store_types['type'].value_counts()
print(store_counts)
# Get the proportion of stores of each type
store_props = store_types['type'].value_counts(normalize=True)  # returns proportions instead of counts
print(store_props)
# Count the number of each department number and sort
dept_counts_sorted = store_types['department'].value_counts(sort=True)  # sorts counts from largest to smallest; ascending=True reverses
print(dept_counts_sorted)
# Get the proportion of departments of each number and sort
dept_props_sorted = store_types['department'].value_counts(sort=True, normalize=True)
print(dept_props_sorted)
Grouped Summary Statistics
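.groupby() splits the rows by a column's values before aggregating. A minimal sketch of grouped totals and their proportions, using only columns already seen in sales:
# Total weekly sales for each store type
sales_by_type = sales.groupby("type")["weekly_sales"].sum()
print(sales_by_type)
# Proportion of total sales coming from each type
print(sales_by_type / sales_by_type.sum())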
# For each store type, aggregate weekly_sales: get min, max, mean, and median
sales_stats = sales.groupby("type")["weekly_sales"].agg(["min", "max", "mean", "median"])  # group rows by type, then summarize each group
# Print sales_stats
print(sales_stats)
# For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg(["min", "max", "mean", "median"])
# Print unemp_fuel_stats
print(unemp_fuel_stats)
Pivot Table
mean_sales_by_type_holiday = sales.pivot_table(values="weekly_sales", index="type", columns="is_holiday")  # pivot_table aggregates with the mean by default
# Print mean_sales_by_type_holiday
print(mean_sales_by_type_holiday)
Indexing: put selected columns as rows
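A minimal sketch of setting a column as the index, assuming temperatures has a city column; the city names below are placeholders:
# Move the city column into the index
temperatures_ind = temperatures.set_index("city")
# Subset rows by index label with .loc (placeholder city names)
print(temperatures_ind.loc[["Moscow", "Saint Petersburg"]])
# Undo: move the index back to a regular column
print(temperatures_ind.reset_index().head())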