Course Notes: Working with Categorical Data in Python

Intro

# Ordinal and Nominal

# Create categorical series
# 1
pd.Series(df, dtype = 'category')
# 2
pd.Categorical(df, categories = ["", "", ""], ordered = True)

Categorical Pandas Series

# set categories
df["col1"] = df["col1"].cat.set_categories(new_categories = ["short", "medium", "long"], ordered = True)
# add categories
df["col1"] = df["col1"].cat.add_categories(new_categories = ["did not check", "could not tell"])
# remove categories
df["col1"] = df["col1"].cat.remove_categories(removals = ["wirehaired"])
# check categories
df["col1"].cat.categories

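# rename: each new name must not already be a category; cannot collapse two categories into one
name_changes = {"unknown mix": "unknown"}  # not named `dict`: that would shadow the builtin
df["col1"] = df["col1"].cat.rename_categories(name_changes)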

df["col1"] = df["col1"].cat.rename_categories(lambda c: c.title()) # convert into title case

# Collapse
updated_colors = {
    "black and brown": "black",
    "black and tan": "black",
    "black and white": "black"
}
df["main_color"] = df["color"].replace(updated_colors)
# the type of column "main_color" is object not category

# reorder
df["col1"] = df["col1"].cat.reorder_categories(new_categories = [], ordered = True/False)
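# --> older in-place form (the `inplace` argument was removed in pandas 2.0; prefer the assignment above):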
df["col1"].cat.reorder_categories(new_categories = [], ordered = True/False, inplace = True)

# fix issues
# whitespace
df['col1'] = df['col1'].str.strip()

# capitalization
df["col1"] = df["col1"].str.title() # .upper()/ .lower()



df["col1"].str.contains("", regex = False)

df.loc[df[""]=="", ""]

Visualization

sns.catplot(data=, x=, y=, kind = )
# kind = "strip"/"swarm"/"box"/"violin"/"boxen"/"point"/"bar"/"count"

# set font size and plot background
sns.set(font_scale=1.4)
sns.set_style("whitegrid")

sns.catplot(kind = "point", dodge = , join = )

sns.catplot(x=, data=, kind = "count", hue=)

sns.catplot(x=, data=, kind=, col="", col_wrap=, palette=sns.color_palette("Set1"))  # "Set1"/"Set2"/"tab10"/"Paired"


ax = sns.catplot()
ax.fig.suptitle("")
ax.set_axis_labels("")
plt.subplots_adjust(top = .9)
plt.show()

Encoding

codes = df["name"].cat.codes
categories = df["name"]
name_map = dict(zip(codes, categories))

df["codes"] = df["name"].cat.codes
# reverting to previous values
df["codes"].map(name_map)

# boolean coding
used_cars["van_code"] = np.where(
    used_cars["body_type"].str.contains("van", regex = False), 1, 0
)

# one-hot encoding
pd.get_dummies(data, columns = [], prefix = "")