Skip to content
Data Manipulation with pandas
Data Manipulation with pandas
Run the hidden code cell below to import the data used in this course.
# Import the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Import the four datasets
# Each CSV file under datasets/ is read into its own DataFrame, bound to a
# module-level name that matches the file name.
avocado = pd.read_csv("datasets/avocado.csv")
homelessness = pd.read_csv("datasets/homelessness.csv")
temperatures = pd.read_csv("datasets/temperatures.csv")
walmart = pd.read_csv("datasets/walmart.csv")
Take Notes
Add notes about the concepts you've learned and code cells with code you want to keep.
Chapter One
# To load a CSV file into a DataFrame and view its first few rows
# (head() shows five rows unless another count is passed)
import pandas as pd
df = pd.read_csv("csv_file.csv")
df.head()
# To see the shape of the dataset as a (rows, columns) tuple
df.shape
# To see the column labels found in your dataset
df.columns
# To see a summary of the dataset (column dtypes and non-null counts)
df.info()
# To display summary statistics for the dataset's columns
df.describe()
# To see the data as a 2D NumPy array
# (the pandas docs recommend df.to_numpy() for the same purpose)
df.values
# To see the row index (row labels) of your dataset
df.index
# To sort rows by one or more columns; sort_values returns a new DataFrame
# and leaves df unchanged unless the result is assigned back
df.sort_values("column_name", ascending=False)
df.sort_values(["column_one", "column_two"], ascending=[True, False])
# To select a single column (returns a Series); attribute access only works
# when the column name is a valid Python identifier that does not clash with
# an existing DataFrame attribute, so bracket access is the safer form
df.column_one
df["column_one"]
# To select multiple columns (returns a DataFrame), pass a list of names
df[["column_one","column_two"]]
# To filter rows, put a boolean condition inside df[...]
df[df["column_one"] > 450]
df[df["column_two"] == "red"]
# Combine conditions with & (and) or | (or); each condition needs its own
# parentheses because & and | bind more tightly than comparisons
df[(df["column_one"] > 450) & (df["column_two"] == "red")]
# Subsetting rows by categorical variables: .isin() on its own only builds a
# boolean mask, so wrap it in df[...] to keep the matching rows
df[df["column_two"].isin(["red", "blue"])]
# Adding new columns / mutating / feature engineering / transforming
# (the columns being combined must hold compatible types, e.g. both numeric)
df["new_column"] = df["column_two"] + df["column_one"]
Chapter Two: Summary Statistics
# Central tendency: arithmetic mean of one column, median of another
df["column_one"].mean()
df["column_two"].median()
# Extremes: largest and smallest value in a column
df["column_two"].max()
df["column_two"].min()