A retail company is on a transformative journey, aiming to elevate their customer services through cutting-edge advancements in Speech Recognition and Natural Language Processing (NLP). As the machine learning engineer for this initiative, you are tasked with developing functionalities that not only convert customer support audio calls into text but also explore methodologies to extract insights from transcribed texts.
In this dynamic project, we leverage the power of SpeechRecognition, Pydub, and spaCy – three open-source packages that form the backbone of your solution. Your objectives are:
- Transcribe a sample customer audio call, stored at sample_customer_call.wav, to showcase the power of open-source speech recognition technology.
- Analyze sentiment, identify common named entities, and enhance the user experience by searching for the most similar customer calls to a given query within a subset of pre-transcribed call data, stored at customer_call_transcriptions.csv.
This project is an opportunity to unlock the potential of machine learning to revolutionize customer support. Let's delve into the interplay between technology and service excellence.
!pip install SpeechRecognition
!pip install pydub
!pip install spacy
!python3 -m spacy download en_core_web_sm# Import required libraries
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import speech_recognition as sr
from pydub import AudioSegment
import spacy# Load and convert the audio file to the correct format (wav, mono, 16kHz)
# --- Task 1: Speech to Text ---
# Load the raw audio and record a few of its properties.
audio = AudioSegment.from_file("sample_customer_call.wav")
frame_rate = audio.frame_rate
number_channels = audio.channels

# BUG FIX: the recognizer below reads "sample_customer_call_converted.wav",
# but the original code never created that file. Export a mono, 16 kHz
# version first so the transcription step has something to read.
audio.set_channels(1).set_frame_rate(16000).export(
    "sample_customer_call_converted.wav", format="wav"
)

# Initialize recognizer
recognizer = sr.Recognizer()

# Transcribe the converted file with Google's free web speech API.
with sr.AudioFile("sample_customer_call_converted.wav") as source:
    audio_data = recognizer.record(source)
    transcribed_text = recognizer.recognize_google(audio_data)

transcribed_text, frame_rate, number_channels
# --- Load the pre-transcribed customer call data ---
df = pd.read_csv("customer_call_transcriptions.csv")
df.head()
# --- Task 2: Sentiment Analysis with VADER ---
sid = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Classify `text` as 'positive', 'negative', or 'neutral'.

    Uses VADER's compound score with the standard thresholds:
    >= 0.05 -> positive, <= -0.05 -> negative, otherwise neutral.
    `str(text)` guards against non-string values (e.g. NaN) in the column.
    """
    score = sid.polarity_scores(str(text))['compound']
    if score >= 0.05:
        return 'positive'
    if score <= -0.05:
        return 'negative'
    return 'neutral'


df['predicted_sentiment'] = df['text'].apply(get_sentiment)

# Count true positives: predicted positive and actual label is positive
true_positive = len(df.loc[(df['predicted_sentiment'] == 'positive')
                           & (df['sentiment_label'] == 'positive')])
true_positive
# --- Task 3: Named Entity Recognition ---
from collections import Counter

# Load spaCy small English model
nlp = spacy.load("en_core_web_sm")

# Run NER over all transcriptions concatenated into one document.
all_text = " ".join(df['text'].astype(str))
doc = nlp(all_text)

# Collect entity surface strings and pick the most frequent one
# (None when no entities were found).
entities = [ent.text for ent in doc.ents]
most_freq_ent = Counter(entities).most_common(1)[0][0] if entities else None
most_freq_ent

# Import used by the next cell (preserved from the original fused line).
from sklearn.feature_extraction.text import TfidfVectorizer
# --- Task 4: Find the call most similar to a query (TF-IDF + cosine) ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus = df['text'].astype(str).tolist()
query = "wrong package delivery"
corpus_with_query = corpus + [query]

# Fit TF-IDF on the calls plus the query so they share a single vocabulary.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus_with_query)

# Cosine similarity between the query (last row) and every call.
similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

# Index of the best-matching call.
most_similar_idx = similarities.argmax()
most_similar_text = corpus[most_similar_idx]

# NOTE(review): `display` is an IPython builtin and is undefined when this
# file runs as a plain script — confirm this is executed inside a notebook.
display(df.iloc[most_similar_idx])
print(most_similar_text)
# # Before you start
# # In order to complete the project you may wish to install SpeechRecognition, Pydub and spaCy libraries and download pretrained spaCy small English Language model.
# !pip install SpeechRecognition
# !pip install pydub
# !pip install spacy
# !python3 -m spacy download en_core_web_sm
# # Import required libraries
# import pandas as pd
# import nltk
# nltk.download('vader_lexicon')
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# import speech_recognition as sr
# from pydub import AudioSegment
# import spacy
# # Task 1 - Speech to Text: convert the sample audio call, sample_customer_call.wav, to text and store the result in transcribed_text
# # Define a recognizer object
# recognizer = sr.Recognizer()
# # Convert the audio file to audio data
# transcribe_audio_file = sr.AudioFile("sample_customer_call.wav")
# with transcribe_audio_file as source:
# transcribe_audio = recognizer.record(source)
# # Convert the audio data to text
# transcribed_text = recognizer.recognize_google(transcribe_audio)
# # Review trascribed text
# print("Transcribed text: ", transcribed_text)
# # Task 1 - Speech to Text: store few statistics of the audio file such as number of channels, sample width and frame rate
# # Review number of channels and frame rate of the audio file
# audio_segment = AudioSegment.from_file("sample_customer_call.wav")
# number_channels = audio_segment.channels
# frame_rate = audio_segment.frame_rate
# print("Number of channels: ", number_channels)
# print("Frame rate: ", frame_rate)
# # Task 2 - Sentiment Analysis: use vader module from nltk library to determine the sentiment of each text of the customer_call_transcriptions.csv file and store them at a new sentiment_label column using compound score
# # Import customer call transcriptions data
# df = pd.read_csv("customer_call_transcriptions.csv")
# sid = SentimentIntensityAnalyzer()
# # Analyze sentiment by evaluating compound score generated by Vader SentimentIntensityAnalyzer
# def find_sentiment(text):
# scores = sid.polarity_scores(text)
# compound_score = scores['compound']
# if compound_score >= 0.05:
# return 'positive'
# elif compound_score <= -0.05:
# return 'negative'
# else:
# return 'neutral'
# df['sentiment_predicted'] = df.apply(lambda row: find_sentiment(row["text"]), axis = 1)
# # Task 2 - Sentiment Analysis: calculate number of texts with positive label that are correctly labeled as positive
# true_positive = len(df.loc[(df['sentiment_predicted'] == df['sentiment_label']) &
# (df['sentiment_label'] == 'positive')])
# print("True positives: ", true_positive)
# # Task 3 - Named Entity Recognition: find named entities for each text in the df object and store entities in a named_entities column
# # Load spaCy small English Language model
# nlp = spacy.load("en_core_web_sm")
# # NER using spaCy
# def extract_entities(text):
# doc = nlp(text)
# entities = [ent.text for ent in doc.ents]
# return entities
# # Apply NER to the entire text column
# df['named_entities'] = df['text'].apply(extract_entities)
# # Flatten the list of named entities
# all_entities = [ent for entities in df['named_entities'] for ent in entities]
# # Create a DataFrame with the counts
# entities_df = pd.DataFrame(all_entities, columns=['entity'])
# entities_counts = entities_df['entity'].value_counts().reset_index()
# entities_counts.columns = ['entity', 'count']
# # Extract most frequent named entity
# most_freq_ent = entities_counts["entity"].iloc[0]
# print("Most frequent entity: ", most_freq_ent)
# # Task 4 - Find most similar text: find the list of customer calls that complained about "wrong package delivery" by finding similarity score of each text to the "wrong package delivery" string using spaCy small English Language model
# # Load spaCy model
# nlp = spacy.load("en_core_web_sm")
# # Process the text column
# df['processed_text'] = df['text'].apply(lambda text: nlp(text))
# # Input query
# input_query = "wrong package delivery"
# processed_query = nlp(input_query)
# # Calculate similarity scores and sort dataframe with respect to similarity scores
# df['similarity'] = df['processed_text'].apply(lambda text: processed_query.similarity(text))
# df = df.sort_values(by='similarity', ascending=False)
# # Find the most similar text
# most_similar_text = df["text"].iloc[0]
# print("Most similar text: ", most_similar_text)