Welcome to the world of e-commerce, where customer feedback is a goldmine of insights! In this project, you'll dive into the Women's Clothing E-Commerce Reviews dataset, focusing on the 'Review Text' column filled with direct customer opinions.
Your mission is to use text embeddings and Python to analyze these reviews, uncover underlying themes, and understand customer sentiments. This analysis will help improve customer service and product offerings.
The Data
You will be working with a dataset specifically focusing on customer reviews. Below is the data dictionary for the relevant field:
womens_clothing_e-commerce_reviews.csv
| Column | Description |
|---|---|
| 'Review Text' | Textual feedback provided by customers about their shopping experience and product quality. |
Armed with access to powerful embedding API services, you will process the reviews, extract meaningful insights, and present your findings.
Let's get started!
Install useful libraries
# Run this cell to install ChromaDB if desired
try:
assert version('chromadb') == '0.4.17'
except:
!pip install chromadb==0.4.17
try:
assert version('pysqlite3') == '0.5.2'
except:
!pip install pysqlite3-binary==0.5.2
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import chromadb
Load the dataset
Load data and perform basic data checks to ensure you are using relevant data for the analysis
# Read the reviews CSV into a DataFrame
import pandas as pd
reviews = pd.read_csv(filepath_or_buffer="womens_clothing_e-commerce_reviews.csv")
# Preview the first ten rows as a quick sanity check of the loaded data
reviews.head(n=10)
# Start coding here
# Use as many cells as you need.
import pandas as pd
from openai import OpenAI
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import distance
import os
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
# Reload the dataset and keep only the rows whose 'Review Text' is present
reviews = pd.read_csv("womens_clothing_e-commerce_reviews.csv")
review_text = reviews['Review Text'].dropna().tolist()
# One identifier per retained review, numbered by position: id_0, id_1, ...
review_ids = [f"id_{position}" for position in range(len(review_text))]
# OpenAI API client used by create_embeddings
client = OpenAI()
#Define function to create the embeddings
def create_embeddings(text, batch_size=2048):
    """Embed one or more texts with OpenAI's text-embedding-3-small model.

    Args:
        text: A single string, or a sequence of strings, to embed.
        batch_size: Maximum number of texts sent per API request. The
            default matches the embeddings endpoint's documented limit of
            2048 items per input array, so larger inputs are split into
            several requests instead of being rejected.

    Returns:
        A list with one embedding (a list of floats) per input text, in
        input order. A single string yields a one-element list; an empty
        sequence yields an empty list without calling the API.
    """
    # Normalise to a list so slicing works for strings and any iterable.
    texts = [text] if isinstance(text, str) else list(text)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        responses = client.embeddings.create(
            model="text-embedding-3-small",
            input=texts[start:start + batch_size])
        responses_dict = responses.model_dump()
        embeddings.extend(
            response['embedding'] for response in responses_dict['data'])
    return embeddings
# Embed every review text
embeddings = create_embeddings(review_text)

# Project the embedding vectors down to two dimensions with t-SNE
tsne = TSNE(n_components=2, perplexity=5)
embedding_matrix = np.array(embeddings)
embeddings_2d = tsne.fit_transform(embedding_matrix)

# Scatter-plot the 2D projection of the reviews
x_coords = embeddings_2d[:, 0]
y_coords = embeddings_2d[:, 1]
plt.scatter(x_coords, y_coords)
plt.title("t-SNE Visualization of Review Embeddings")
plt.xlabel("t-SNE feature 1")
plt.ylabel("t-SNE feature 2")

# Label each point with its position in review_text
for point_number, (x_pos, y_pos) in enumerate(zip(x_coords, y_coords)):
    plt.annotate(str(point_number), (x_pos, y_pos))

plt.show()
#Feedback categorization
#Define function to find closest embedding to query
def find_closest(query_vector, embeddings):
    """Return the entry of *embeddings* with the smallest cosine distance.

    Args:
        query_vector: The vector to compare against.
        embeddings: Iterable of candidate vectors.

    Returns:
        A dict with keys "distance" (cosine distance to the query) and
        "index" (position of the candidate in *embeddings*). When several
        candidates tie, the earliest one is returned.

    Raises:
        ValueError: If *embeddings* is empty.
    """
    scored = (
        {"distance": distance.cosine(query_vector, candidate), "index": position}
        for position, candidate in enumerate(embeddings)
    )
    return min(scored, key=lambda entry: entry["distance"])
#Define topics
topics = [
    {'label': 'Quality'},
    {'label': 'Fit'},
    {'label': 'Style'},
    {'label': 'Comfort'},
]

# Embed the topic labels so they can be compared with the review embeddings
class_descriptions = [topic['label'] for topic in topics]
class_embeddings = create_embeddings(class_descriptions)

# Assign each review the topic whose label embedding is nearest by cosine
# distance. The review vectors are stored in `embeddings` (computed from
# review_text above), so the two sequences line up position by position.
review_topics = []
for review, review_embedding in zip(review_text, embeddings):
    closest = find_closest(review_embedding, class_embeddings)
    label = topics[closest['index']]['label']
    review_topics.append({'review': review, 'topic': label})
#print (review_topics)
# Use chromadb to create embeddings
# Create reviews_collection on chromadb
# The Chroma client gets its own name so that the OpenAI `client` read by
# create_embeddings is not overwritten.
chroma_client = chromadb.PersistentClient()
collection = chroma_client.create_collection(
    name="reviews_collection",
    embedding_function=OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small",
        api_key=os.environ["OPENAI_API_KEY"],
    ),
)


#define a function that returns the n_closest reviews
def n_closest_reviews(query, num_results):
    """Query reviews_collection for the reviews most similar to *query*.

    Args:
        query: Text to search for.
        num_results: Number of nearest reviews to request.

    Returns:
        The result of ``collection.query``; its 'documents' entry holds one
        list of matching review texts per query text.
    """
    return collection.query(query_texts=query, n_results=num_results)


try:
    #add the review_text and reviews_ids to the collection
    collection.add(ids=review_ids, documents=review_text)
    #print('collection count:',collection.count())
    #print(collection.get(ids=['id_5']))

    # Look up the three stored reviews closest to the first review
    result = n_closest_reviews(review_text[0], 3)
    most_similar_reviews = result['documents'][0]
    print(most_similar_reviews)
finally:
    #remove the collection after every execution
    # Done in `finally` so a failed add/query does not leave the collection
    # behind; a leftover collection would presumably make create_collection
    # fail on the next run, since the client is persistent.
    chroma_client.delete_collection("reviews_collection")