Skip to content
Project: What Makes a Good Book?
Identifying popular products is incredibly important for e-commerce companies! Popular products generate more revenue and, therefore, play a key role in stock control.
You've been asked to support an online bookstore by building a model to predict whether a book will be popular or not. They've supplied you with an extensive dataset containing information about all books they've sold, including:
price
popularity
(target variable)review/summary
review/text
review/helpfulness
authors
categories
You'll need to build a model that predicts whether a book will be rated as popular or not.
They have high expectations of you, so have set a target of at least 70% accuracy! You are free to use as many features as you like, and will need to engineer new features to achieve this level of performance.
# Import some required packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Read in the dataset
books = pd.read_csv("data/books.csv")
# Inspect the DataFrame
books.info()
# Visualize popularity frequencies
sns.countplot(data=books, x="popularity")
plt.show()
# Check frequencies
print(books["categories"].value_counts())
# Filter out rare categories to avoid overfitting
books = books.groupby("categories").filter(lambda x: len(x) > 100)
# One-hot encoding categories
categories = pd.get_dummies(books["categories"], drop_first=True)
# Bring categories into the DataFrame
books = pd.concat([books, categories], axis=1)
# Remove original column
books.drop(columns=["categories"], inplace=True)
# Get number of total reviews
books["num_reviews"] = books["review/helpfulness"].str.split("/", expand=True)[1]
# Get number of helpful reviews
books["num_helpful"] = books["review/helpfulness"].str.split("/", expand=True)[0]
# Convert to integer datatype
for col in ["num_reviews", "num_helpful"]:
books[col] = books[col].astype(int)
# Add percentage of helpful reviews as a column to normalize the data
books["perc_helpful_reviews"] = books["num_helpful"] / books["num_reviews"]
# Fill null values
books["perc_helpful_reviews"].fillna(0, inplace=True)
# Drop original column
books.drop(columns=["review/helpfulness"], inplace=True)
# Convert strings to lowercase
for col in ["review/summary", "review/text", "description"]:
books[col] = books[col].str.lower()
# Create a list of positive words to measure positive text sentiment
positive_words = ["great", "excellent", "good", "interesting", "enjoy", "helpful", "useful", "like", "love", "beautiful", "fantastic", "perfect", "wonderful", "impressive", "amazing", "outstanding", "remarkable", "brilliant", "exceptional", "positive",
"thrilling"]
# Instantiate a CountVectorizer
vectorizer = CountVectorizer(vocabulary=positive_words)
# Fit and transform review/text
review_text = books["review/text"]
text_transformed = vectorizer.fit_transform(review_text.fillna(''))
# Fit and transform review/summary
review_summary = books["review/summary"]
summary_transformed = vectorizer.fit_transform(review_summary.fillna(''))
# Fit and transform description
description = books["description"]
description_transformed = vectorizer.fit_transform(description.fillna(''))
# Add positive counts into DataFrame to add measures of positive sentiment
books["positive_words_text"] = text_transformed.sum(axis=1).reshape(-1, 1)
books["positive_words_summary"] = summary_transformed.sum(axis=1).reshape(-1, 1)
books["positive_words_description"] = description_transformed.sum(axis=1).reshape(-1, 1)
# Remove original columns
books.drop(columns=["review/text", "review/summary", "description"], inplace=True)
# Splitting into features and target values
X = books.drop(columns=["title", "authors", "popularity"]).values
y = books["popularity"].values.reshape(-1, 1)
# Splitting into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Instantiate and fit a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=120, max_depth=50, min_samples_split=5, random_state=42, class_weight="balanced")
clf.fit(X_train, y_train.ravel())
# Evaluate accuracy
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))
model_accuracy = clf.score(X_test, y_test)