Skip to content

Welcome to the world of e-commerce, where customer feedback is a goldmine of insights! In this project, you'll dive into the Women's Clothing E-Commerce Reviews dataset, focusing on the 'Review Text' column filled with direct customer opinions.

Your mission is to use text embeddings and Python to analyze these reviews, uncover underlying themes, and understand customer sentiments. This analysis will help improve customer service and product offerings.

The Data

You will be working with a dataset specifically focusing on customer reviews. Below is the data dictionary for the relevant field:

womens_clothing_e-commerce_reviews.csv

Column | Description
'Review Text' | Textual feedback provided by customers about their shopping experience and product quality.

Armed with access to powerful embedding API services, you will process the reviews, extract meaningful insights, and present your findings.

Let's get started!

Install useful libraries

# Run this cell to install ChromaDB if desired
try:
    assert version('chromadb') == '0.4.17'
except:
    !pip install chromadb==0.4.17
try:
    assert version('pysqlite3') == '0.5.2'
except:
    !pip install pysqlite3-binary==0.5.2
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

import matplotlib.pyplot as plt
from openai import OpenAI
from sklearn.manifold import TSNE
from scipy.spatial import distance
import numpy as np
import json
import os
Hidden output

Load the dataset

Load data and perform basic data checks to ensure you are using relevant data for the analysis

# Read the reviews CSV into a DataFrame
import pandas as pd
reviews = pd.read_csv(filepath_or_buffer="womens_clothing_e-commerce_reviews.csv")

# Basic data checks: first rows, then column names, dtypes and non-null counts
reviews.head()
reviews.info()
# API key is read from the environment; falls back to an empty string if unset
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')

# OpenAI client used for the embedding requests
client = OpenAI(api_key=OPENAI_API_KEY)

# Every 'Review Text' entry converted with str(), in row order
review_texts = list(map(str, reviews['Review Text']))

def create_embeddings(texts, batch_size=1000):
    """Embed texts with the "text-embedding-3-small" model.

    The texts are sent in consecutive batches instead of one request, because
    the OpenAI embeddings endpoint limits how many inputs a single request may
    carry (2048 per its API reference at the time of writing); the default
    stays well below that.

    Args:
        texts: A single string or an iterable of strings to embed.
        batch_size: Maximum number of texts per API request (must be >= 1).

    Returns:
        A list with one embedding (list of floats) per input text, in input
        order. An empty input yields an empty list without calling the API.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string is one input, not a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=texts[start:start + batch_size],
        )
        response_dict = response.model_dump()
        embeddings.extend(data['embedding'] for data in response_dict['data'])
    return embeddings

# Embed every review text
embeddings = create_embeddings(review_texts)

# Reduce the embeddings to two components with t-SNE
tsne = TSNE(perplexity=5, n_components=2)
embeddings_2d = tsne.fit_transform(np.array(embeddings))

# Scatter plot: first component on x, second component on y
plt.scatter(*embeddings_2d.T)
plt.show()
# Topic labels that reviews are matched against
topic_texts = [
    'quality',
    'fit',
    'style',
    'comfort',
]
# One embedding per topic label, in the same order as topic_texts
topic_embeddings = create_embeddings(topic_texts)
def find_n_closest(query_vector, embeddings, n=3):
    """Return the `n` embeddings closest to `query_vector` by cosine distance.

    Args:
        query_vector: The vector to compare against.
        embeddings: Sequence of candidate vectors.
        n: Maximum number of matches to return (previously ignored, so every
            candidate was returned).

    Returns:
        A list of at most `n` dicts of the form
        {"distance": <cosine distance>, "index": <position in embeddings>},
        ordered from closest to farthest.
    """
    distances = [
        {"distance": distance.cosine(query_vector, embedding), "index": index}
        for index, embedding in enumerate(embeddings)
    ]
    distances_sorted = sorted(distances, key=lambda x: x["distance"])
    # max() keeps a negative n from slicing off the tail instead of limiting.
    return distances_sorted[:max(n, 0)]
# Print the closest topic for the first five reviews.
# zip() over slices simply stops early when fewer than five reviews exist,
# whereas indexing with range(5) would raise IndexError.
for review_text, review_embedded in zip(review_texts[:5], embeddings[:5]):
    review_topic = find_n_closest(review_embedded, topic_embeddings, n=1)
    # "index" is the position of the matched topic inside topic_texts
    topic = topic_texts[review_topic[0]["index"]]
    print(f"Reviews: {review_text}, Topic: {topic}")
# Use a dedicated name for the Chroma client. Rebinding `client` here would
# overwrite the OpenAI client that create_embeddings() reads, so that function
# would fail once this cell has run.
chroma_client = chromadb.PersistentClient()

# Fetch the "clothing_reviews" collection, creating it if it does not exist.
# Documents are embedded through OpenAI's "text-embedding-3-small" model.
collection = chroma_client.get_or_create_collection(
    name="clothing_reviews",
    embedding_function=OpenAIEmbeddingFunction(
        model_name="text-embedding-3-small",
        api_key=OPENAI_API_KEY
    )
)

print(chroma_client.list_collections())
# Collection ids: each "Review ID" value prefixed with "s"
ids = [f"s{review_id!s}" for review_id in reviews["Review ID"]]
def create_chromadb_document(review):
    """Render one review record as a multi-line "Label: value" document.

    Args:
        review: Mapping-like record (e.g. a DataFrame row) indexable by the
            column names listed below.

    Returns:
        A string with one "<column name>: <value>" line per column, joined
        with newlines, in the order listed below.
    """
    # Each output label is identical to the column it is read from.
    field_names = (
        "Clothing ID",
        "Age",
        "Title",
        "Review Text",
        "Rating",
        "Recommended IND",
        "Positive Feedback Count",
        "Division Name",
        "Department Name",
        "Class Name",
    )
    return "\n".join(f"{name}: {review[name]}" for name in field_names)
# One document per review row: its title followed by its review text
documents = [
    f"Title: {review['Title']}\nReview Text: {review['Review Text']}"
    for _, review in reviews.iterrows()
]

# Store the documents in the collection under their matching ids
collection.add(documents=documents, ids=ids)
def closest_reviews(query_vector, n):
    """Query the review collection for the `n` nearest results.

    Args:
        query_vector: The query; despite its name it is passed to the
            collection as the single entry of `query_texts`.
        n: Number of results requested (`n_results`).

    Returns:
        Whatever `collection.query` returns for that single query.
    """
    query_arguments = {"query_texts": [query_vector], "n_results": n}
    return collection.query(**query_arguments)