Project: Topic Analysis of Clothing Reviews with Embeddings

Welcome to the world of e-commerce, where customer feedback is a goldmine of insights! In this project, you'll dive into the Women's Clothing E-Commerce Reviews dataset, focusing on the 'Review Text' column filled with direct customer opinions.

Your mission is to use text embeddings and Python to analyze these reviews, uncover underlying themes, and understand customer sentiments. This analysis will help improve customer service and product offerings.

The Data

You will be working with a dataset specifically focusing on customer reviews. Below is the data dictionary for the relevant field:

womens_clothing_e-commerce_reviews.csv

Column	Description
`'Review Text'`	Textual feedback provided by customers about their shopping experience and product quality.

Armed with access to powerful embedding API services, you will process the reviews, extract meaningful insights, and present your findings.

Let's get started!

Install useful libraries

# Run this cell to install ChromaDB if desired
try:
    assert version('chromadb') == '0.4.17'
except:
    !pip install chromadb==0.4.17
try:
    assert version('pysqlite3') == '0.5.2'
except:
    !pip install pysqlite3-binary==0.5.2
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import chromadb

Hidden output

Load the dataset

Load the data and perform basic checks to confirm it is relevant to the analysis.

# Read the customer reviews CSV into a DataFrame
import pandas as pd
reviews = pd.read_csv(filepath_or_buffer="womens_clothing_e-commerce_reviews.csv")

# Preview the leading rows as a quick sanity check on the load
reviews.head(n=5)

# Start coding here
# Use as many cells as you need.
import os
import openai
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from scipy.spatial import distance
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

EMBEDDING_MODEL = "text-embedding-3-small"
# Number of reviews sent per embeddings request. A single request carrying
# every review can exceed the endpoint's per-request input cap, so the
# reviews are sent in batches (1000 is assumed to be comfortably below it).
EMBEDDING_BATCH_SIZE = 1000

# Get review text, dropping rows that have no review
review_texts = reviews['Review Text'].dropna()

# Create and store the embeddings for reviews, one request per batch.
# Batches are processed in order, so embeddings[i] belongs to the i-th review.
client = openai.OpenAI()
review_list = review_texts.tolist()
embeddings = []
for start in range(0, len(review_list), EMBEDDING_BATCH_SIZE):
    batch = review_list[start:start + EMBEDDING_BATCH_SIZE]
    responses = client.embeddings.create(
        input=batch,
        model=EMBEDDING_MODEL
    ).model_dump()
    embeddings.extend(response['embedding'] for response in responses['data'])

# Reduce the embeddings to two dimensions with t-SNE
def apply_tsne(embeddings):
    """Project embedding vectors onto two components using t-SNE.

    Args:
        embeddings: The embedding vectors, handed unchanged to
            ``TSNE.fit_transform``.

    Returns:
        The value returned by ``TSNE.fit_transform`` for the input.
    """
    # random_state is pinned to 0 and the output is fixed at 2 components
    reducer = TSNE(random_state=0, n_components=2)
    projected = reducer.fit_transform(embeddings)
    return projected

embeddings_2d = apply_tsne(np.array(embeddings))

# Plotting the results of the t-SNE
def plot_tsne(tsne_results):
    """Draw a scatter plot of 2-D points, labelling each with its index.

    Args:
        tsne_results: Iterable of points; element 0 of each point is used
            as the x coordinate and element 1 as the y coordinate.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    points = list(tsne_results)
    plt.figure(figsize=(12, 8))
    # One scatter call for all points: calling plt.scatter once per point
    # creates a separate artist for every point and gives each a different
    # colour from the colour cycle, which carries no meaning here.
    plt.scatter([point[0] for point in points],
                [point[1] for point in points],
                alpha=0.5)
    # Annotate every point with its position in the input
    for i, point in enumerate(points):
        plt.text(point[0], point[1], str(i), fontsize=8, verticalalignment='center')
    plt.title("t-SNE Visualisation of Review Embeddings")
    plt.xlabel("t-SNE feature 1")
    plt.ylabel("t-SNE feature 2")
    plt.show()

plot_tsne(embeddings_2d)

# Feedback categorisation

# Topic labels that each review will be matched against
categories = ['Quality', 'Fit', 'Style', 'Comfort']

# Embed the topic labels using EMBEDDING_MODEL
category_responses = client.embeddings.create(
    model=EMBEDDING_MODEL,
    input=categories,
).model_dump()

# Keep only the vector from each returned entry
category_embeddings = [entry['embedding'] for entry in category_responses['data']]

def categorize_feedback(text_embedding, category_embeddings, categories):
    """Return the category whose embedding is nearest to ``text_embedding``.

    Nearness is measured with cosine distance. When several categories are
    equally near, the one that appears first wins.

    Args:
        text_embedding: Embedding vector of the text to classify.
        category_embeddings: One embedding vector per category.
        categories: Category labels, indexed in step with
            ``category_embeddings``.

    Returns:
        The element of ``categories`` at the position of the nearest
        category embedding.
    """
    # Cosine distance from the text to every category, in category order
    distances = [
        distance.cosine(text_embedding, category_embedding)
        for category_embedding in category_embeddings
    ]
    # min() keeps the earliest position on ties
    nearest = min(range(len(distances)), key=distances.__getitem__)
    return categories[nearest]

# Assign every review embedding to its nearest topic label
feedback_categories = [
    categorize_feedback(review_embedding, category_embeddings, categories)
    for review_embedding in embeddings
]

# Vector storage

# NOTE: this rebinds `client`; from here on the name refers to the ChromaDB
# client, not the OpenAI client created earlier.
client = chromadb.PersistentClient()

# Define the vector database. The embedding function is used for any text
# that is added or queried without an explicit embedding.
review_embeddings_db = client.create_collection(
    name="review_embeddings",
    embedding_function=OpenAIEmbeddingFunction(
        model_name=EMBEDDING_MODEL,
        api_key=os.environ["OPENAI_API_KEY"]
    )
)

# Store the reviews together with the embeddings already computed for them.
# Passing `embeddings` stops the collection from calling the embedding
# function again for every document; `embeddings` was built from
# review_texts.tolist() in the same order, so the three lists line up.
review_documents = review_texts.tolist()
review_embeddings_db.add(
    ids=[str(i) for i in range(len(review_documents))],
    embeddings=embeddings,
    documents=review_documents
)

# Similarity search helper
def find_similar_reviews(input_text, collection_name, n):
    """Query a stored collection with ``input_text`` and return the results.

    Args:
        input_text: Text to search with; sent as the only query text.
        collection_name: Name of the collection fetched from the
            module-level ``client``.
        n: Number of results requested from the query.

    Returns:
        The object returned by ``collection.query``.

    Raises:
        KeyError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    embedder = OpenAIEmbeddingFunction(
        api_key=os.environ["OPENAI_API_KEY"],
        model_name="text-embedding-3-small",
    )
    collection = client.get_collection(
        embedding_function=embedder,
        name=collection_name,
    )
    return collection.query(n_results=n, query_texts=[input_text])

# Example: fetch the three stored reviews nearest to a sample review
example_review = "Absolutely wonderful - silky and sexy and comfortable"
search_results = find_similar_reviews(example_review, "review_embeddings", 3)
# "documents" holds one list per query text; only one query was sent
most_similar_reviews = search_results["documents"][0]
print(most_similar_reviews)

# Clean up: remove the collection created for this analysis
client.delete_collection(name="review_embeddings")

Project: Topic Analysis of Clothing Reviews with Embeddings

The Data

womens_clothing_e-commerce_reviews.csv

Install useful libraries

Load the dataset

The Data