Skip to content
Course Notes: Working with Categorical Data in Python
Intro
# Ordinal and Nominal
# Create categorical series
# 1
pd.Series(df, dtype = 'category')
# 2
pd.Categorical(df, categories = ["", "", ""], ordered = True)
Categorical Pandas Series
# set categories
df["col1"] = df["col1"].cat.set_categories(new_categories = ["short", "medium", "long"], ordered = True)
# add categories
df["col1"] = df["col1"].cat.add_categories(new_categories = ["did not check", "could not tell"])
# remove categories
df["col1"] = df["col1"].cat.remove_categories(removals = ["wirehaired"])
# check categories
df["col1"].cat.categories
# rename: must use new name; cannot collapse two into one
# (do not call the mapping `dict` -- that would shadow the builtin dict())
name_changes = {"unknown mix": "unknown"}
df["col1"] = df["col1"].cat.rename_categories(name_changes)
df["col1"] = df["col1"].cat.rename_categories(lambda c: c.title()) # convert into title case
# Collapse
updated_colors = {
"black and brown": "black",
"black and tan": "black",
"black and white": "black"
}
df["main_color"] = df["color"].replace(updated_colors)
# the type of column "main_color" is object not category
# reorder
df["col1"] = df["col1"].cat.reorder_categories(new_categories = [], ordered = True/False)
# equivalent in-place form (older pandas only: `inplace` was deprecated in 1.3 and removed in 2.0):
df["col1"].cat.reorder_categories(new_categories = [], ordered = True/False, inplace = True)
# fix issues
# whitespace
df['col1'] = df['col1'].str.strip()
# capitalization
df["col1"] = df["col1"].str.title() # .upper()/ .lower()
df["col1"].str.contains("", regex = False)
df.loc[df[""]=="", ""]
Visualization
sns.catplot(data=, x=, y=, kind = )
# kind = "strip"/"swarm"/"box"/"violin"/"boxen"/"point"/"bar"/"count"
# set font size and plot background
sns.set(font_scale=1.4)
sns.set_style("whitegrid")
sns.catplot(kind = "point", dodge = , join = )
sns.catplot(x=, data=, kind = "count", hue=)
sns.catplot(x=, data=, kind=, col="", col_wrap=, palette=sns.color_palette("Set1")) # "Set1"/"Set2"/"tab10"/"Paired"
ax = sns.catplot()
ax.fig.suptitle("")
ax.set_axis_labels("")
plt.subplots_adjust(top = .9)
plt.show()
Encoding
codes = df["name"].cat.codes
categories = df["name"]
name_map = dict(zip(codes, categories))
df["codes"] = df["name"].cat.codes
# reverting to previous values
df["codes"].map(name_map)
# boolean coding
used_cars["van_code"] = np.where(
used_cars["body_type"].str.contains("van", regex = False), 1, 0
)
# one-hot encoding
pd.get_dummies(data, columns = [], prefix = "")