Skip to content

Welcome to the world of e-commerce, where customer feedback is a goldmine of insights! In this project, you'll dive into the Women's Clothing E-Commerce Reviews dataset, focusing on the 'Review Text' column filled with direct customer opinions.

Your mission is to use text embeddings and Python to analyze these reviews, uncover underlying themes, and understand customer sentiments. This analysis will help improve customer service and product offerings.

The Data

You will be working with a dataset specifically focusing on customer reviews. Below is the data dictionary for the relevant field:

womens_clothing_e-commerce_reviews.csv

Column | Description
'Review Text' | Textual feedback provided by customers about their shopping experience and product quality.

Armed with access to powerful embedding API services, you will process the reviews, extract meaningful insights, and present your findings.

Let's get started!

Install useful libraries

# Run this cell to install ChromaDB if desired
try:
    assert version('chromadb') == '0.4.17'
except:
    !pip install chromadb==0.4.17
try:
    assert version('pysqlite3') == '0.5.2'
except:
    !pip install pysqlite3-binary==0.5.2
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import chromadb
Hidden output

Load the dataset

Load data and perform basic data checks to ensure you are using relevant data for the analysis

# Embedding the reviews and plotting the vectors in a 2-D graph

import pandas as pd
import openai
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAIError
import os
from dotenv import load_dotenv

# --- Authentication -------------------------------------------------------
# Prefer the key from the environment / .env file; fall back to an
# interactive prompt when it is missing or empty.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY") or input("Enter your OpenAI API key: ").strip()
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in .env and no key provided!")

try:
    openai_client = openai.OpenAI(api_key=api_key)
    # Listing the models is used as a cheap round-trip to validate the key.
    openai_client.models.list()
except openai.AuthenticationError:
    raise ValueError("Invalid OpenAI API key!")

# --- Data -----------------------------------------------------------------
# Read the CSV and discard every row that has no review text.
reviews = pd.read_csv("womens_clothing_e-commerce_reviews.csv").dropna(
    subset=['Review Text']
)

# Plain Python list holding the text of each remaining review.
review_text = reviews['Review Text'].tolist()

# Defining an embeddings function
def embed(text, batch_size=2048):
    """Return one embedding vector per input string.

    The strings are sent to the ``text-embedding-3-small`` model through the
    module-level ``openai_client``. Inputs are split into consecutive batches
    so that a single request never carries more than ``batch_size`` strings.

    Args:
        text: A single string or a sequence of strings to embed.
        batch_size: Maximum number of strings per API request. The default
            matches the array-length limit documented for the OpenAI
            embeddings endpoint.

    Returns:
        A list of embedding vectors in the same order as the input. A single
        string yields a one-element list; empty input yields ``[]`` without
        calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string would otherwise be sliced character by character below.
    texts = [text] if isinstance(text, str) else list(text)

    vectors = []
    for start in range(0, len(texts), batch_size):
        response = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input=texts[start:start + batch_size],
        )
        vectors.extend(data.embedding for data in response.data)
    return vectors

# Embed every review, then stack the vectors into a single NumPy array.
embeddings = embed(review_text)
embeddings_array = np.array(embeddings)

# Project the embeddings down to two dimensions with t-SNE.
# NOTE(review): recent scikit-learn releases appear to rename `n_iter` to
# `max_iter` - confirm against the installed version.
tsne = TSNE(n_components=2, perplexity=30, random_state=42, n_iter=300)
embeddings_2d = tsne.fit_transform(embeddings_array)
# The original author expected a shape of (958, 2) here.
print(f"Reduced to 2D: {embeddings_2d.shape}")

# Scatter plot of the 2-D points, coloured by each review's rating.
x_coords = embeddings_2d[:, 0]
y_coords = embeddings_2d[:, 1]
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    x_coords,
    y_coords,
    c=reviews['Rating'],
    cmap='viridis',
    s=50,       # marker size
    alpha=0.6,  # partial transparency so overlapping points stay visible
)
plt.colorbar(label='Rating (1-5)')
plt.title('2D t-SNE of Women\'s Clothing Review Embeddings', fontsize=14)
plt.xlabel('t-SNE Dimension 1', fontsize=12)
plt.ylabel('t-SNE Dimension 2', fontsize=12)

# Label at most the first ten points with their positional ID.
annotated_count = min(10, len(review_text))
for i, (x_pos, y_pos) in enumerate(embeddings_2d[:annotated_count]):
    plt.annotate(
        f"ID{i}",
        (x_pos, y_pos),
        fontsize=8,
        alpha=0.7,
        xytext=(5, 5),
        textcoords='offset points',
    )

plt.tight_layout()
plt.show()
# Storing the vector embeddings in a vector database (ChromaDB) and running similarity search against the refined query topics

import chromadb
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAIError
import os
from dotenv import load_dotenv

# Authentication
# Reuse the client that was built and validated earlier in the notebook.
# A bare `openai.OpenAI()` only looks at the OPENAI_API_KEY environment
# variable, so it fails when the key was typed in at the prompt instead.
client = openai_client

# ChromaDB setup
chroma_client = chromadb.Client()
# Fetch the collection if it already exists, otherwise create it, without
# hiding unrelated errors behind a bare `except`.
collection = chroma_client.get_or_create_collection("reviews")
if collection.count() == 0:
    # Populate only an empty collection so re-running the cell does not try
    # to insert the same IDs twice.
    collection.add(
        embeddings=embeddings,
        documents=review_text,
        ids=[str(i) for i in range(len(review_text))]
    )
print(f"Collection 'reviews' has {collection.count()} entries.")

# Listing refined topic queries
topics = ["high quality material", "poor fit sizing", "stylish fashionable design", "comfortable wear"]
topic_results = {}

# Querying top 3 reviews per topic
for topic in topics:
    try:
        response = client.embeddings.create(model="text-embedding-3-small", input=topic)
    except OpenAIError as e:
        # An embedding failure for one topic should not stop the others.
        print(f"Error embedding '{topic}': {e}")
        topic_results[topic] = []
        continue

    topic_embedding = response.data[0].embedding
    results = collection.query(
        query_embeddings=[topic_embedding],
        n_results=3,
        include=["documents", "distances", "metadatas"],
    )
    # Each result field holds one list per query embedding; a single
    # embedding was sent, so only index 0 is read.
    topic_results[topic] = [
        {"id": review_id, "text": document, "distance": distance}
        for review_id, document, distance in zip(
            results["ids"][0], results["documents"][0], results["distances"][0]
        )
    ]

# Print results
for topic, matches in topic_results.items():
    print(f"\nReviews about '{topic}':")
    for rank, res in enumerate(matches, start=1):
        # Show only the first 50 characters, marking truncation with "...".
        review_snippet = res["text"][:50] + ("..." if len(res["text"]) > 50 else "")
        print(f"{rank}. (ID: {res['id']}, Distance: {res['distance']:.3f}) {review_snippet}")
Run cancelled
#Similarity search function that outputs the closest 3 reviews to a given input review for a more personalized customer service response

import chromadb
import openai
import os
from dotenv import load_dotenv

# --- Authentication -------------------------------------------------------
# Prefer the key from the environment / .env file; fall back to an
# interactive prompt when it is missing or empty.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY") or input("Enter your OpenAI API key: ").strip()
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in .env and no key provided!")

try:
    openai_client = openai.OpenAI(api_key=api_key)
    # Listing the models is used as a cheap round-trip to validate the key.
    openai_client.models.list()
except openai.AuthenticationError:
    raise ValueError("Invalid OpenAI API key!")

# --- ChromaDB -------------------------------------------------------------
# Look up the "reviews" collection, which is expected to exist already
# (the original author notes it holds 958 reviews).
chroma_client = chromadb.Client()
collection = chroma_client.get_collection("reviews")

# similarity search function to return a list
def find_similar_reviews(input_review: str, n_results: int = 3) -> list[str]:
    """
    Return the stored reviews closest to ``input_review``.

    The input is embedded with ``text-embedding-3-small`` and the resulting
    vector is used to query the module-level ChromaDB ``collection``.

    Args:
        input_review (str): Review text to find neighbours for.
        n_results (int): How many neighbours to return (default: 3).
    Returns:
        list[str]: Texts of the nearest reviews, or an empty list when the
        embedding request fails.
    """
    # Step 1: turn the input review into an embedding vector.
    try:
        embedding_response = openai_client.embeddings.create(
            input=[input_review],
            model="text-embedding-3-small",
        )
        query_vector = embedding_response.data[0].embedding
    except Exception as err:
        print(f"Error embedding input review: {err}")
        return []

    # Step 2: nearest-neighbour lookup, requesting only the document texts.
    matches = collection.query(
        query_embeddings=[query_vector],
        n_results=n_results,
        include=["documents"],
    )

    # One query vector was sent, so the answer is the first inner list.
    nearest_documents = matches["documents"][0]
    return nearest_documents

# Test: look up the reviews closest to a sample review
test_review = "Absolutely wonderful - silky and sexy and comfortable"
# Pass the sample defined above; the previously referenced `first_review`
# name is not defined anywhere and raised a NameError.
most_similar_reviews = find_similar_reviews(test_review, n_results=3)

# Print for verification
print(f"Input Review: {test_review}")
print("Top 3 Similar Reviews:")
for position, review in enumerate(most_similar_reviews, start=1):
    # Long reviews are cut at 300 characters and marked with "...".
    shown_text = f"{review[:300]}..." if len(review) > 300 else review
    print(f"Review {position}: {shown_text}")