Skip to content
Data Manipulation with pandas
Data Manipulation with pandas
# Import the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Import the four datasets
avocado = pd.read_csv("datasets/avocado.csv")
homelessness = pd.read_csv("datasets/homelessness.csv")
temperatures = pd.read_csv("datasets/temperatures.csv")
walmart = pd.read_csv("datasets/walmart.csv").head() -imprime los primeros datos, util si tienes varias filas
.info() -imprime info de las columnas nombre, type
.shape - tupla con numero de (filas, columnas)
.describe -algunas estadisticas, media mediana descripcion general
.values -atributo de valor en una matriz(valor bidimensional)
.columns -nombres de las columnas
.index -RangeIndex(start=0, stop=7, step=1)
# Print the head of the homelessness data
print(homelessness.head())
region state individuals family_members state_pop
0 East South Central Alabama 2570.0 864.0 4887681
1 Pacific Alaska 1434.0 582.0 735139
2 Mountain Arizona 7259.0 2606.0 7158024
3 West South Central Arkansas 2280.0 432.0 3009733
4 Pacific California 109008.0 20964.0 39461588
# Print information about homelessness
print(homelessness.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 region 51 non-null object
1 state 51 non-null object
2 individuals 51 non-null float64
3 family_members 51 non-null float64
4 state_pop 51 non-null int64
dtypes: float64(2), int64(1), object(2)
memory usage: 2.4+ KB
None
# Print the shape of homelessness
print(homelessness.shape)
(51, 5)
# Print a description of homelessness
print(homelessness.describe())
individuals family_members state_pop
count 51.000 51.000 5.100e+01
mean 7225.784 3504.882 6.406e+06
std 15991.025 7805.412 7.327e+06
min 434.000 75.000 5.776e+05
25% 1446.500 592.000 1.777e+06
50% 3082.000 1482.000 4.461e+06
75% 6781.500 3196.000 7.341e+06
max 109008.000 52070.000 3.946e+07Sorting
.sort_values("weight", ascending= False) <- de mayor a menor
dogs.sort_values(["weight_kg", "height_cm"], ascending=[True, False]) <- el primero de de ma a me y el segundo contrario
para solo una columna dogs["name"]
dogs[["name", "height"]]
- el corchete de afuera se encarga de subdividir el Dataframe y los de adentro estan creando una lista de columnas -> subconjuntos
dogs[dogs["height_cm"] > 50]
- con toda las filas
dogs[dogs["breed"] == "Labrador"]
Para juntar condiciones &
- is_lab = dogs["breed"] == "Labrador"
- is_brown = dogs["color"] == "Brown"
dogs[is_lab & is_brown]
dogs[ (dogs["breed"] == "Labrador") & (dogs["color"] == "Brown") ]
south_mid_atlantic = homelessness[(homelessness["region"] == "South Atlantic")| (homelessness["region"] == "Mid-Atlantic")]
# Subsetting using .isin()
is_black_or_brown = dogs["color"].isin(["Black", "Brown"])
dogs[is_black_or_brown]# Sort homelessness by descending family members
homelessness_fam = homelessness.sort_values("family_members", ascending=False)
# Print the top few rows
print(homelessness_fam.head())
-----
region state individuals family_members state_pop
32 Mid-Atlantic New York 39827.0 52070.0 19530351
4 Pacific California 109008.0 20964.0 39461588
21 New England Massachusetts 6811.0 13257.0 6882635
# Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"], ascending =[True, False])
# Print the top few rows
print(homelessness_reg_fam.head())
region state individuals family_members state_pop
13 East North Central Illinois 6752.0 3891.0 12723071
35 East North Central Ohio 6929.0 3320.0 11676341
22 East North Central Michigan 5209.0 3142.0 9984072
-----
# Select the individuals column
individuals = homelessness["individuals"]
# Print the head of the result
print(individuals.head())
0 2570.0
1 1434.0
-----
# Select the state and family_members columns
state_fam = homelessness[["state", "family_members"]]
# Print the head of the result
print(state_fam.head())
state family_members
0 Alabama 864.0
1 Alaska 582.0
2 Arizona 2606.0
-----
# Filter for rows where individuals is greater than 10000
ind_gt_10k = homelessness[homelessness["individuals"]>10000]
# See the result
print(ind_gt_10k)
region state individuals family_members state_pop
4 Pacific California 109008.0 20964.0 39461588
9 South Atlantic Florida 21443.0 9587.0 21244317
-----
# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness["region"]== "Mountain"]
# See the result
print(mountain_reg)
region state individuals family_members state_pop
2 Mountain Arizona 7259.0 2606.0 7158024
5 Mountain Colorado 7607.0 3250.0 5691287
12 Mountain Idaho 1297.0 715.0 1750536
------
# Filter for rows where family_members is less than 1000 and region is Pacific
fam_lt_1k_pac = homelessness[(homelessness["family_members"] < 1000)& (homelessness["region"] == "Pacific")]
# See the result
print(fam_lt_1k_pac)
region state individuals family_members state_pop
1 Pacific Alaska 1434.0 582.0 735139
------
# Subset for rows in South Atlantic or Mid-Atlantic regions
south_mid_atlantic = homelessness[(homelessness["region"] == "South Atlantic")| (homelessness["region"] == "Mid-Atlantic")]
# See the result
print(south_mid_atlantic)
(south_mid_atlantic)
region state individuals family_members state_pop
7 South Atlantic Delaware 708.0 374.0 965479
30 Mid-Atlantic New Jersey 6048.0 3350.0 8886025
-----
# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]
# Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness["state"].isin(canu)]
# See the result
print(mojave_homelessness)
region state individuals family_members state_pop
2 Mountain Arizona 7259.0 2606.0 7158024
4 Pacific California 109008.0 20964.0 39461588NEW COLUMN
dogs["height_m"] = dogs["height_cm"] / 100
print(dogs)
dogs["bmi"] = dogs["weight_kg"] / dogs["height_m"] ** 2
print(dogs.head())`# Add total col as sum of individuals and family_members
homelessness["total"] = homelessness["individuals"] + homelessness["family_members"]
# Add p_individuals col as proportion of total that are individuals
homelessness["p_individuals"]= homelessness["individuals"]/ homelessness["total"]
# See the result
print(homelessness)
region state individuals family_members state_pop p_individuals total
0 East South Central Alabama 2570.0 864.0 4887681 0.748 3434.0
1 Pacific Alaska 1434.0 582.0 735139 0.711 2016.0# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] /homelessness["state_pop"]
# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"] > 20]
# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values(["indiv_per_10k"], ascending=False)
# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state", "indiv_per_10k"]]
# See the result
print(result)
state indiv_per_10k
8 District of Columbia 53.738
11 Hawaii 29.079
4 California 27.624
37 Oregon 26.636S U M M A R Y S T A D I S T I C S
.median()
.mode()
.min()
.max()
.var()
.std()
.sum()
.quantile()
.cumsum() suma acumulativa
.cummax() maximo acumulativo
.cummin() minimo acumulativo
.cumprod() producto acumulativo
The .agg() method
def pct30(column): return column.quantile(0.30)
- Esto toma toda una columna y da el percentil 30 de la columna
dogs["weight_kg"].agg(pct30) 22.599999999999998
dogs[["weight_kg", "height_cm"]].agg(pct30)
weight_kg 22.6 height_cm 45.4 dtype: float64
** dogs["weight_kg"].agg([pct30,** pct40]) pct30 22.6 pct40 24.0
# Print the head of the sales DataFrame
print(sales.head())
# Print the info about the sales DataFrame
print(sales.info())
# Print the mean of weekly_sales
print(sales["weekly_sales"].mean())
# Print the median of weekly_sales
print(sales["weekly_sales"].median())
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 store 10774 non-null int64
1 type 10774 non-null object
2 department 10774 non-null int32
dtypes: bool(1), datetime64[ns](1), float64(4), int32(1), int64(1), object(1)
None
23843.95014850566
12049.064999999999
-------
# Print the maximum of the date column
print(sales["date"].max())
2012-10-26 00:00:00
# Print the minimum of the date column
print(sales["date"].min())
2010-02-05 00:00:00I Q R