Welcome to the world of e-commerce, where customer feedback is a goldmine of insights! In this project, you'll dive into the Women's Clothing E-Commerce Reviews dataset, focusing on the 'Review Text' column filled with direct customer opinions.
Your mission is to use text embeddings and Python to analyze these reviews, uncover underlying themes, and understand customer sentiments. This analysis will help improve customer service and product offerings.
The Data
You will be working with a dataset specifically focusing on customer reviews. Below is the data dictionary for the relevant field:
womens_clothing_e-commerce_reviews.csv
| Column | Description |
|---|---|
| 'Review Text' | Textual feedback provided by customers about their shopping experience and product quality. |
Armed with access to powerful embedding API services, you will process the reviews, extract meaningful insights, and present your findings.
Let's get started!
Install useful libraries
# Run this cell to install ChromaDB if desired
import subprocess
import sys
from importlib.metadata import PackageNotFoundError, version


def _ensure_pinned(distribution, pinned):
    """Install ``distribution==pinned`` unless exactly that version is present.

    Args:
        distribution: Name of the distribution as published on the package
            index (this is what ``importlib.metadata.version`` looks up).
        pinned: Exact version string that must be installed.

    Raises:
        subprocess.CalledProcessError: If the pip invocation fails.
    """
    try:
        if version(distribution) == pinned:
            return
    except PackageNotFoundError:
        pass  # not installed at all -> fall through to the install below
    # Invoke pip through the running interpreter so the package lands in the
    # same environment this code executes in.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", f"{distribution}=={pinned}"]
    )


_ensure_pinned("chromadb", "0.4.17")
# The installed distribution is "pysqlite3-binary"; it provides the
# importable module "pysqlite3".
_ensure_pinned("pysqlite3-binary", "0.5.2")

# Make every later `import sqlite3` resolve to pysqlite3 instead of the
# standard-library module.
# NOTE(review): presumably done because chromadb needs a newer SQLite than
# the one bundled with this Python -- confirm.
__import__("pysqlite3")
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
import chromadb
Load the dataset
Load data and perform basic data checks to ensure you are using relevant data for the analysis
# Import needed packages and load the dataset
import pandas as pd
import numpy as np
from openai import OpenAI
from sklearn.manifold import TSNE
# Read the reviews CSV into a DataFrame
reviews = pd.read_csv("womens_clothing_e-commerce_reviews.csv")

# Display the first few entries
reviews.head()

# Discard rows without any review text (modifies `reviews` in place)
reviews.dropna(subset=["Review Text"], inplace=True)

# Put each review on a single line by swapping newlines for spaces
review_texts = list(reviews["Review Text"].str.replace("\n", " "))
review_texts[0]

# create an openai connection. label it as we will also use chromadb
client_oai = OpenAI()
# If review_texts is a pandas Series, convert it to a list
if isinstance(review_texts, pd.Series):
    review_texts = review_texts.tolist()

# Keep non-blank strings as they are and stringify non-null numbers; anything
# else (None, NaN, blank strings, other types) is skipped.
# NOTE: because entries can be skipped, embeddings[i] corresponds to
# cleaned_texts[i], which is not necessarily review_texts[i].
cleaned_texts = []
for t in review_texts:
    if isinstance(t, str) and t.strip() != "":
        cleaned_texts.append(t)
    elif isinstance(t, (float, int)) and not pd.isnull(t):
        # Convert numbers to strings if needed
        cleaned_texts.append(str(t))

# Send the texts in batches: the embeddings endpoint rejects a request that
# carries too many inputs, so a single call with every review fails on
# larger datasets.
EMBEDDING_BATCH_SIZE = 256
embeddings = []
response = None  # stays None when there is nothing to embed
for start in range(0, len(cleaned_texts), EMBEDDING_BATCH_SIZE):
    # `response` is rebound on every iteration, so afterwards it holds only
    # the final batch's response.
    response = client_oai.embeddings.create(
        model='text-embedding-3-small',
        input=cleaned_texts[start:start + EMBEDDING_BATCH_SIZE],
    )
    embeddings.extend(x.embedding for x in response.data)
len(embeddings) > 900
# ensure perplexity is smaller than the number of embeddings
# Use a perplexity of 30, or half the number of embeddings when that is lower
perplexity = min(30, len(embeddings) // 2)
perplexity

# Reduce the embeddings to two dimensions with t-SNE
tsne = TSNE(perplexity=perplexity, n_components=2)
embeddings_2d = np.asarray(tsne.fit_transform(np.asarray(embeddings)))

import matplotlib.pyplot as plt
import seaborn as sns

# Scatter the first t-SNE component against the second
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1])
plt.show()
# Define a create_embeddings function
from scipy.spatial import distance
# --- FIX: Add batching to avoid BadRequestError for too many inputs ---
def create_embeddings(texts, batch_size=16):
    """Embed ``texts`` with the ``text-embedding-3-small`` model.

    The texts are sent through the module-level ``client_oai`` in consecutive
    batches so that no single request carries more than ``batch_size`` inputs.

    Args:
        texts: A sequence of strings. A single bare string is treated as a
            one-element sequence rather than being sliced into characters.
        batch_size: Maximum number of texts per request; must be at least 1.

    Returns:
        A flat list of embeddings gathered batch by batch (assumes the API
        returns one embedding per input, in request order).

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )
    if isinstance(texts, str):
        # Slicing a str yields substrings, which would be embedded piecewise.
        texts = [texts]

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        response = client_oai.embeddings.create(
            model="text-embedding-3-small",
            input=texts[start:start + batch_size],
        )
        all_embeddings.extend(item.embedding for item in response.data)
    return all_embeddings
# Topics a review can be assigned to, each with a one-sentence description
feedback_topics = [
    {
        "label": "fit",
        "description": "review speaks about how the garment fit.",
    },
    {
        "label": "quality",
        "description": "review speaks about the quality of the garment.",
    },
    {
        "label": "style",
        "description": "review speaks about the style of the garment",
    },
    {
        "label": "comfort",
        "description": "review speaks about the comfort of the garment.",
    },
]

# Collect the description of every topic, then embed those descriptions
feedback_descriptions = [topic["description"] for topic in feedback_topics]
feedback_descriptions_embeddings = create_embeddings(feedback_descriptions)
def find_closest(query_vector, embedding_map):
    """Find the embedding with the smallest cosine distance to a query.

    Args:
        query_vector: The vector to compare against every candidate.
        embedding_map: An iterable of candidate vectors; positions in this
            iterable are what the returned ``"index"`` refers to.

    Returns:
        A dict with two keys: ``"distance"`` (the cosine distance to the
        closest candidate) and ``"index"`` (that candidate's position). On
        ties the earliest candidate wins.

    Raises:
        ValueError: If ``embedding_map`` contains no embeddings.
    """
    distances = [
        {"distance": distance.cosine(query_vector, embedding), "index": index}
        for index, embedding in enumerate(embedding_map)
    ]
    if not distances:
        # min() on an empty list raises a ValueError that does not say which
        # argument was at fault; fail with an explicit message instead.
        raise ValueError("embedding_map must contain at least one embedding")
    return min(distances, key=lambda x: x["distance"])
##Uncomment below to run the classifier
# for index, review in enumerate(review_texts):
# # Find the closest distance and its index using find_closest()
# closest = find_closest(embeddings[index], feedback_descriptions_embeddings)
# # Look up the topic label using the index from closest
# label = feedback_topics[closest['index']]['label']
# print(f'"{index}" was classified as {label}')