Skip to content

Data Manipulation with pandas

# Import the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import the four datasets
avocado = pd.read_csv("datasets/avocado.csv")
homelessness = pd.read_csv("datasets/homelessness.csv")
temperatures = pd.read_csv("datasets/temperatures.csv")
walmart = pd.read_csv("datasets/walmart.csv")

.head() -imprime los primeros datos, util si tienes varias filas

.info() -imprime info de las columnas nombre, type

.shape - tupla con numero de (filas, columnas)

.describe -algunas estadisticas, media mediana descripcion general

.values -atributo de valor en una matriz(valor bidimensional)

.columns -nombres de las columnas

.index -RangeIndex(start=0, stop=7, step=1)

# Print the head of the homelessness data
print(homelessness.head())

               region       state  individuals  family_members  state_pop
0  East South Central     Alabama       2570.0           864.0    4887681
1             Pacific      Alaska       1434.0           582.0     735139
2            Mountain     Arizona       7259.0          2606.0    7158024
3  West South Central    Arkansas       2280.0           432.0    3009733
4             Pacific  California     109008.0         20964.0   39461588


# Print information about homelessness
print(homelessness.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   region          51 non-null     object 
 1   state           51 non-null     object 
 2   individuals     51 non-null     float64
 3   family_members  51 non-null     float64
 4   state_pop       51 non-null     int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 2.4+ KB
None    
    
# Print the shape of homelessness
print(homelessness.shape)
(51, 5)


# Print a description of homelessness
print(homelessness.describe())

       individuals  family_members  state_pop
count       51.000          51.000  5.100e+01
mean      7225.784        3504.882  6.406e+06
std      15991.025        7805.412  7.327e+06
min        434.000          75.000  5.776e+05
25%       1446.500         592.000  1.777e+06
50%       3082.000        1482.000  4.461e+06
75%       6781.500        3196.000  7.341e+06
max     109008.000       52070.000  3.946e+07

Sorting

.sort_values("weight", ascending= False) <- de mayor a menor

dogs.sort_values(["weight_kg", "height_cm"], ascending=[True, False]) <- el primero de de ma a me y el segundo contrario

para solo una columna dogs["name"]

dogs[["name", "height"]]

  • el corchete de afuera se encarga de subdividir el Dataframe y los de adentro estan creando una lista de columnas -> subconjuntos

dogs[dogs["height_cm"] > 50]

  • con toda las filas

dogs[dogs["breed"] == "Labrador"]

Para juntar condiciones &

  • is_lab = dogs["breed"] == "Labrador"
  • is_brown = dogs["color"] == "Brown"

dogs[is_lab & is_brown]

dogs[ (dogs["breed"] == "Labrador") & (dogs["color"] == "Brown") ]

south_mid_atlantic = homelessness[(homelessness["region"] == "South Atlantic")| (homelessness["region"] == "Mid-Atlantic")]

# Subsetting using .isin()
is_black_or_brown = dogs["color"].isin(["Black", "Brown"])
dogs[is_black_or_brown]
# Sort homelessness by descending family members
homelessness_fam = homelessness.sort_values("family_members", ascending=False)
# Print the top few rows
print(homelessness_fam.head())
-----

                region          state  individuals  family_members  state_pop
32        Mid-Atlantic       New York      39827.0         52070.0   19530351
4              Pacific     California     109008.0         20964.0   39461588
21         New England  Massachusetts       6811.0         13257.0    6882635

# Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"], ascending =[True, False])
# Print the top few rows
print(homelessness_reg_fam.head())

                    region      state  individuals  family_members  state_pop
    13  East North Central   Illinois       6752.0          3891.0   12723071
    35  East North Central       Ohio       6929.0          3320.0   11676341
    22  East North Central   Michigan       5209.0          3142.0    9984072
-----    
# Select the individuals column
individuals = homelessness["individuals"]
# Print the head of the result
print(individuals.head())

0      2570.0
1      1434.0
-----
# Select the state and family_members columns
state_fam = homelessness[["state", "family_members"]]
# Print the head of the result
print(state_fam.head())

       state  family_members
0     Alabama           864.0
1      Alaska           582.0
2     Arizona          2606.0
-----
# Filter for rows where individuals is greater than 10000
ind_gt_10k = homelessness[homelessness["individuals"]>10000]
# See the result
print(ind_gt_10k)

                region       state  individuals  family_members  state_pop
4              Pacific  California     109008.0         20964.0   39461588
9       South Atlantic     Florida      21443.0          9587.0   21244317
-----
# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness["region"]== "Mountain"]
# See the result
print(mountain_reg)

      region       state  individuals  family_members  state_pop
2   Mountain     Arizona       7259.0          2606.0    7158024
5   Mountain    Colorado       7607.0          3250.0    5691287
12  Mountain       Idaho       1297.0           715.0    1750536
------
# Filter for rows where family_members is less than 1000 and region is Pacific
fam_lt_1k_pac = homelessness[(homelessness["family_members"] < 1000)& (homelessness["region"] == "Pacific")]
# See the result
print(fam_lt_1k_pac)

    region   state  individuals  family_members  state_pop
1  Pacific  Alaska       1434.0           582.0     735139
------
# Subset for rows in South Atlantic or Mid-Atlantic regions
south_mid_atlantic = homelessness[(homelessness["region"] == "South Atlantic")| (homelessness["region"] == "Mid-Atlantic")]
# See the result
print(south_mid_atlantic)

(south_mid_atlantic)
            region                 state  individuals  family_members  state_pop
7   South Atlantic              Delaware        708.0           374.0     965479
30    Mid-Atlantic            New Jersey       6048.0          3350.0    8886025
-----
# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]
# Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness["state"].isin(canu)]
# See the result
print(mojave_homelessness)

      region       state  individuals  family_members  state_pop
2   Mountain     Arizona       7259.0          2606.0    7158024
4    Pacific  California     109008.0         20964.0   39461588

NEW COLUMN

dogs["height_m"] = dogs["height_cm"] / 100
print(dogs)

dogs["bmi"] = dogs["weight_kg"] / dogs["height_m"] ** 2
print(dogs.head())`
# Add total col as sum of individuals and family_members
homelessness["total"] = homelessness["individuals"] + homelessness["family_members"]
# Add p_individuals col as proportion of total that are individuals
homelessness["p_individuals"]= homelessness["individuals"]/ homelessness["total"]
# See the result
print(homelessness)

                region                 state  individuals  family_members  state_pop  p_individuals     total
0   East South Central               Alabama       2570.0           864.0    4887681          0.748    3434.0
1              Pacific                Alaska       1434.0           582.0     735139          0.711    2016.0
# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness["individuals"] /homelessness["state_pop"] 
# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"] > 20]
# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values(["indiv_per_10k"], ascending=False)
# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state", "indiv_per_10k"]]
# See the result
print(result)
                       state  indiv_per_10k
    8   District of Columbia         53.738
    11                Hawaii         29.079
    4             California         27.624
    37                Oregon         26.636

S U M M A R Y S T A D I S T I C S

.median()

.mode()

.min()

.max()

.var()

.std()

.sum()

.quantile()

.cumsum() suma acumulativa

.cummax() maximo acumulativo

.cummin() minimo acumulativo

.cumprod() producto acumulativo


The .agg() method

def pct30(column): return column.quantile(0.30)

  • Esto toma toda una columna y da el percentil 30 de la columna

dogs["weight_kg"].agg(pct30) 22.599999999999998

dogs[["weight_kg", "height_cm"]].agg(pct30)

weight_kg 22.6 height_cm 45.4 dtype: float64

** dogs["weight_kg"].agg([pct30,** pct40]) pct30 22.6 pct40 24.0


 # Print the head of the sales DataFrame
print(sales.head())
# Print the info about the sales DataFrame
print(sales.info())
# Print the mean of weekly_sales
print(sales["weekly_sales"].mean())
# Print the median of weekly_sales
print(sales["weekly_sales"].median())

 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   store                 10774 non-null  int64         
 1   type                  10774 non-null  object        
 2   department            10774 non-null  int32
dtypes: bool(1), datetime64[ns](1), float64(4), int32(1), int64(1), object(1)
None
23843.95014850566
12049.064999999999
-------
# Print the maximum of the date column
print(sales["date"].max())
2012-10-26 00:00:00
# Print the minimum of the date column
print(sales["date"].min())
2010-02-05 00:00:00

I Q R