
Data Manipulation with pandas

Run the hidden code cell below to import the data used in this course.

# Import the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import the four datasets
avocado = pd.read_csv("datasets/avocado.csv")
homelessness = pd.read_csv("datasets/homelessness.csv")
temperatures = pd.read_csv("datasets/temperatures.csv")
sales = pd.read_csv("datasets/walmart.csv")

Filtering and Sorting

# Sorting rows by column values

hom = homelessness.sort_values('individuals', ascending=False)  # Sort in reverse (descending) order


tom = homelessness.sort_values(["individuals", "family_members"], ascending=[True, False])  # Sort by multiple columns; the order of the lists matters
print(tom.head())

## Filter method

# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness['region'] == "Mountain"]

# See the result
print(mountain_reg)


## More complex syntax

# Filter for rows where family_members is less than 1000 and region is Pacific

fam_lt_1k_pac = homelessness[(homelessness["family_members"] < 1000) & (homelessness["region"] == "Pacific")]
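
For an "either/or" filter across several values, .isin() is more readable than chaining | conditions. A minimal sketch; the region values here are assumptions, not checked against the dataset:

# Filter for rows where region is in a list of values (example values assumed)
atlantic_regions = homelessness[homelessness["region"].isin(["South Atlantic", "Mid-Atlantic"])]
print(atlantic_regions)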

New Columns


## Creating new columns from an old one
homelessness['individuals_per1k'] = homelessness['individuals'] / 1000
homelessness

### Multiple manipulations

# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] / homelessness["state_pop"] 

# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"] > 20]

# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k", ascending=False)

# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state", "indiv_per_10k"]]

# See the result
print(result)

Summary Statistics

# Statistics: .median(), .mode(), .var(), .min(), .max(), .std(), .sum(), .quantile()

sales["date"].min() # nos entrega el perro mas joben 


# While pandas and NumPy have tons of functions, sometimes you may need a different function to summarize your data.
# A custom IQR function; like the standard deviation, it measures spread
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)
    
# Now we use the .agg() method to apply the custom function
# Print IQR of the temperature_c column

print(sales["temperature_c"].agg(iqr))

# Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median])) #Agregamos otra funcion

Counting

# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store", "type"])  # keep only unique store/type combinations
print(store_types.head())

# Subset the rows where is_holiday is True and drop duplicate dates
holiday_dates = sales[sales["is_holiday"]].drop_duplicates(subset="date")  # same idea, but only for rows where is_holiday is True

# Print date col of holiday_dates
print(holiday_dates["date"])


## Counting values

# Count the number of stores of each type
store_counts = store_types['type'].value_counts()
print(store_counts)

# Get the proportion of stores of each type
store_props = store_types['type'].value_counts(normalize=True)  # returns proportions instead of raw counts
print(store_props)

# Count the number of each department number and sort
dept_counts_sorted = store_types['department'].value_counts(sort=True)  # sorted from largest to smallest; pass ascending=True to reverse
print(dept_counts_sorted)

# Get the proportion of departments of each number and sort
dept_props_sorted = store_types['department'].value_counts(sort=True, normalize=True)
print(dept_props_sorted)

Grouped Summary Statistics

# For each store type, aggregate weekly_sales: get min, max, mean, and median
sales_stats = sales.groupby("type")["weekly_sales"].agg(["min", "max", "mean", "median"])  # group rows by type, then compute the statistics per group

# Print sales_stats
print(sales_stats)

# For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg(["min", "max", "mean", "median"])

# Print unemp_fuel_stats
print(unemp_fuel_stats)
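
The same .groupby() pattern extends to multiple grouping columns; a minimal sketch, reusing columns that appear above:

# For each type/is_holiday combination, total the weekly sales
sales_by_type_holiday = sales.groupby(["type", "is_holiday"])["weekly_sales"].sum()
print(sales_by_type_holiday)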

Pivot Table

mean_sales_by_type_holiday = sales.pivot_table(values="weekly_sales", index="type", columns="is_holiday")

# Print mean_sales_by_type_holiday
print(mean_sales_by_type_holiday)
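
.pivot_table() also accepts fill_value to replace missing cells and margins to add overall summaries; a minimal sketch building on the call above:

# Mean weekly_sales by type and is_holiday, with 0 for missing cells
# and an "All" row/column holding the overall means
mean_sales_filled = sales.pivot_table(
    values="weekly_sales",
    index="type",
    columns="is_holiday",
    fill_value=0,
    margins=True,
)
print(mean_sales_filled)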

Indexing: Setting Selected Columns as the Row Index
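
A minimal sketch of the idea, using .set_index() on the state column of homelessness (which dataset the next part actually uses is an assumption):

# Move the state column into the row index
homelessness_ind = homelessness.set_index("state")
print(homelessness_ind.head())

# Undo it, turning state back into a regular column
print(homelessness_ind.reset_index().head())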