Project: What Makes a Good Book?
Identifying popular products is incredibly important for e-commerce companies! Popular products generate more revenue and, therefore, play a key role in stock control.
You've been asked to support an online bookstore by building a model to predict whether a book will be popular or not. They've supplied you with an extensive dataset containing information about all books they've sold, including:
price, popularity (the target variable), review/summary, review/text, review/helpfulness, authors, and categories.
You'll need to build a model that predicts whether a book will be rated as popular or not.
They have high expectations of you, so they have set a target of at least 70% accuracy! You are free to use as many features as you like, and you will need to engineer new features to achieve this level of performance.
# Import required packages
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# Load the bookstore dataset from disk
books = pd.read_csv("data/books.csv")

# Take a first look at the raw data
books.head(10)

# The target should be binary: confirm 'popularity' has exactly two unique values
books['popularity'].nunique()
# Inspect how many books fall into each category
books['categories'].value_counts()

# Standardize capitalization so duplicate categories merge (e.g. "fiction" -> "Fiction")
books['categories'] = books['categories'].str.title()

# Keep only categories with more than 100 books; rare categories invite overfitting
category_counts = books['categories'].value_counts()
common_categories = category_counts[category_counts > 100].index
books = books[books['categories'].isin(common_categories)]

# Verify the category distribution after filtering
books['categories'].value_counts()

# One-hot encode the remaining categories (drop_first avoids a redundant dummy column)
books = pd.get_dummies(books, columns=['categories'], drop_first=True)
# Split "review/helpfulness" (formatted "X/Y") into its two integer parts.
# Compute the split once instead of twice as before.
# NOTE(review): "X/Y" presumably means helpful-votes/total-votes — confirm
# against the data dictionary; the column names below follow the original script.
helpfulness_parts = books['review/helpfulness'].str.split('/', expand=True)
books['num_reviews'] = helpfulness_parts[0].astype(int)
books['num_helpfulness'] = helpfulness_parts[1].astype(int)

# Ratio of the two parts. A 0/0 entry yields NaN, which we treat as 0.
# Assign the filled result directly: chained `.fillna(..., inplace=True)` on a
# column selection is deprecated in pandas 2.x and a no-op under copy-on-write.
books['pct_helpful_review'] = (
    books['num_reviews'] / books['num_helpfulness']
).fillna(0)

# Confirm no missing values remain
books['pct_helpful_review'].isna().sum()
# Words whose presence in a review/description we treat as a positive signal
positive_words = ['good', 'great', 'helpful', 'love', 'interesting', 'amazing',
                  'wonderful', 'excellent', 'memorable', 'engaging', 'thrilling']

# Fixed vocabulary: the vectorizer only counts occurrences of positive_words
vectorizer = CountVectorizer(vocabulary=positive_words, stop_words='english')

# Count positive words per book in each text column (missing text -> empty string)
review_text = vectorizer.fit_transform(books['review/text'].fillna(''))
review_summary = vectorizer.fit_transform(books['review/summary'].fillna(''))
description = vectorizer.fit_transform(books['description'].fillna(''))

# Total positive-word count per book. `.sum(axis=1)` on a scipy sparse matrix
# returns an (n, 1) np.matrix; flatten it to a plain 1-D ndarray before
# assignment — storing a column-vector matrix in a DataFrame relies on the
# deprecated np.matrix interface.
books['positive_text'] = np.asarray(review_text.sum(axis=1)).ravel()
books['positive_summary'] = np.asarray(review_summary.sum(axis=1)).ravel()
books['positive_description'] = np.asarray(description.sum(axis=1)).ravel()
# Raw text and identifier columns have served their purpose — remove them so
# only numeric features remain for modelling
unused_columns = [
    'title', 'authors', 'review/text', 'review/summary',
    'description', 'review/helpfulness',
]
books = books.drop(columns=unused_columns)

# Sanity-check the final feature table
books.head()
# Prepare data for machine learning: separate target from features.
# Keep y 1-D — a (n, 1) column vector triggers sklearn's DataConversionWarning
# and is ravel()ed internally anyway.
y = books['popularity'].values
X = books.drop('popularity', axis=1)

# Hold out 20% for evaluation; stratify so both splits keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10, stratify=y
)

# Random forest classifier; max_features='log2' further decorrelates the trees
rf = RandomForestClassifier(max_features='log2', random_state=10)

# Fit on the training split only
rf.fit(X_train, y_train)

# Accuracy on the held-out test set (client target: >= 0.70)
model_accuracy = rf.score(X_test, y_test)
model_accuracy