You are a product manager for a fitness studio based in Singapore and are interested in understanding the types of digital products you should offer. You plan to conduct a market analysis in Python to understand how to place your digital fitness products in the regional market. A market analysis will allow you to identify strengths of your competitors, gauge demand, and create unique new digital products and services for potential users.
You are provided with a number of CSV files in the "data" folder (under Files), which contain international Google Trends and YouTube keyword-search data on fitness and related products. Two helper functions, read_file and read_geo, have also been provided to help you process and visualize these CSV files for further analysis.
You'll use pandas methods to explore this data and drive your product management insights.
You can continue beyond the bounds of this project and also investigate in-person classes, local gyms, and online personal instructors!
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='white', palette='Pastel2')
import os
def read_file(filepath, plot=True):
    """
    Load a Google Trends time-series CSV export as a long-format DataFrame.

    The file is read with ``header=1`` (the first line is treated as a
    preamble and skipped) and must contain a 'Week' column. Every other
    column is stacked so that each row of the result is one
    (week, region, interest) observation. This corresponds to the first
    graphic (time series) returned by trends.google.com.

    Args:
        filepath: Path to the CSV file.
        plot: If True (default), draw a seaborn line plot of interest over
            time, one line per region, on a new 8x3 figure. If False, no
            figure is created.

    Returns:
        DataFrame with columns 'week' (datetime), 'region' (the original
        column header) and 'interest' (float). Rows whose interest is the
        string "<1" are dropped.
    """
    file = pd.read_csv(filepath, header=1)
    df = file.set_index('Week').stack().reset_index()
    df.columns = ['week', 'region', 'interest']
    df['week'] = pd.to_datetime(df['week'])
    # "<1" cannot be converted to a number, so those rows are removed.
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the unfiltered frame.
    df = df[df['interest'] != "<1"].copy()
    df['interest'] = df['interest'].astype(float)
    if plot:
        # Only open a figure when something will be drawn on it.
        plt.figure(figsize=(8, 3))
        sns.lineplot(data=df, x='week', y='interest', hue='region')
    return df
def read_geo(filepath, multi=False, plot=True):
    """
    Load a Google Trends interest-by-region CSV export and optionally plot it.

    This corresponds to the second graphic returned by trends.google.com.
    The file is read with ``header=1`` (the first line is skipped).

    Args:
        filepath: Path to the CSV file.
        multi: Use False (default) if only one keyword is being analyzed;
            the file's two columns are renamed to 'country' and 'interest'.
            Use True if more than one keyword is being analyzed; the file
            must then have a 'Country' column, the keyword columns are
            stacked into a 'category' column, and a trailing '%' is
            stripped from each interest value before numeric conversion.
        plot: If True (default), draw a seaborn bar plot on a new figure
            (first 25 non-null rows for multi=False; all non-null rows,
            coloured by category, for multi=True). If False, no figure is
            created.

    Returns:
        DataFrame sorted by 'interest' in descending order, with columns
        'country' and 'interest' (multi=False) or 'country', 'category'
        and 'interest' (multi=True).
    """
    file = pd.read_csv(filepath, header=1)
    if multi:
        file = file.set_index('Country').stack().reset_index()
        file.columns = ['country', 'category', 'interest']
        # Strip only a trailing percent sign instead of blindly dropping the
        # last character, so values without '%' keep all their digits.
        file['interest'] = pd.to_numeric(
            file['interest'].astype(str).str.rstrip('%')
        )
        if plot:
            plt.figure(figsize=(3, 8))
            sns.barplot(data=file.dropna(), y='country', x='interest', hue='category')
    else:
        file.columns = ['country', 'interest']
        if plot:
            plt.figure(figsize=(8, 4))
            sns.barplot(data=file.dropna().iloc[:25, :], y='country', x='interest')
    file = file.sort_values(ascending=False, by='interest')
    return file
# Plot demand for working out over time
workout = read_file('data/workout.csv')
# Find mean interest in working out by month.
# Only the numeric 'interest' column is averaged: the text 'region' column
# cannot be averaged and makes .mean() raise on recent pandas versions.
workout.set_index('week', inplace=True)
workout_by_month = workout[['interest']].resample('MS').mean()
# Find month with highest interest, save as string in 'yyyy-mm-dd' format
month_high = workout_by_month[workout_by_month['interest'] == workout_by_month['interest'].max()]
month_str = str(month_high.index[0].date())
print(month_str)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='white', palette='Pastel2')
import os
def read_file(filepath, plot=True):
    """
    Load a Google Trends time-series CSV export as a long-format DataFrame.

    The file is read with ``header=1`` (the first line is treated as a
    preamble and skipped) and must contain a 'Week' column. Every other
    column is stacked so that each row of the result is one
    (week, region, interest) observation. This corresponds to the first
    graphic (time series) returned by trends.google.com.

    Args:
        filepath: Path to the CSV file.
        plot: If True (default), draw a seaborn line plot of interest over
            time, one line per region, on a new 8x3 figure. If False, no
            figure is created.

    Returns:
        DataFrame with columns 'week' (datetime), 'region' (the original
        column header) and 'interest' (float). Rows whose interest is the
        string "<1" are dropped.
    """
    file = pd.read_csv(filepath, header=1)
    df = file.set_index('Week').stack().reset_index()
    df.columns = ['week', 'region', 'interest']
    df['week'] = pd.to_datetime(df['week'])
    # "<1" cannot be converted to a number, so those rows are removed.
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the unfiltered frame.
    df = df[df['interest'] != "<1"].copy()
    df['interest'] = df['interest'].astype(float)
    if plot:
        # Only open a figure when something will be drawn on it.
        plt.figure(figsize=(8, 3))
        sns.lineplot(data=df, x='week', y='interest', hue='region')
    return df
def read_geo(filepath, multi=False, plot=True):
    """
    Load a Google Trends interest-by-region CSV export and optionally plot it.

    This corresponds to the second graphic returned by trends.google.com.
    The file is read with ``header=1`` (the first line is skipped).

    Args:
        filepath: Path to the CSV file.
        multi: Use False (default) if only one keyword is being analyzed;
            the file's two columns are renamed to 'country' and 'interest'.
            Use True if more than one keyword is being analyzed; the file
            must then have a 'Country' column, the keyword columns are
            stacked into a 'category' column, and a trailing '%' is
            stripped from each interest value before numeric conversion.
        plot: If True (default), draw a seaborn bar plot on a new figure
            (first 25 non-null rows for multi=False; all non-null rows,
            coloured by category, for multi=True). If False, no figure is
            created.

    Returns:
        DataFrame sorted by 'interest' in descending order, with columns
        'country' and 'interest' (multi=False) or 'country', 'category'
        and 'interest' (multi=True).
    """
    file = pd.read_csv(filepath, header=1)
    if multi:
        file = file.set_index('Country').stack().reset_index()
        file.columns = ['country', 'category', 'interest']
        # Strip only a trailing percent sign instead of blindly dropping the
        # last character, so values without '%' keep all their digits.
        file['interest'] = pd.to_numeric(
            file['interest'].astype(str).str.rstrip('%')
        )
        if plot:
            plt.figure(figsize=(3, 8))
            sns.barplot(data=file.dropna(), y='country', x='interest', hue='category')
    else:
        file.columns = ['country', 'interest']
        if plot:
            plt.figure(figsize=(8, 4))
            sns.barplot(data=file.dropna().iloc[:25, :], y='country', x='interest')
    file = file.sort_values(ascending=False, by='interest')
    return file
# Plot keyword interest
three_keywords = read_file('data/three_keywords.csv')
# Hard-coded answers; presumably read off the plot above — re-check if the data changes
current = 'gym workout'
peak_covid = 'home workout'
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='white', palette='Pastel2')
import os
def read_file(filepath, plot=True):
    """
    Load a Google Trends time-series CSV export as a long-format DataFrame.

    The file is read with ``header=1`` (the first line is treated as a
    preamble and skipped) and must contain a 'Week' column. Every other
    column is stacked so that each row of the result is one
    (week, region, interest) observation. This corresponds to the first
    graphic (time series) returned by trends.google.com.

    Args:
        filepath: Path to the CSV file.
        plot: If True (default), draw a seaborn line plot of interest over
            time, one line per region, on a new 8x3 figure. If False, no
            figure is created.

    Returns:
        DataFrame with columns 'week' (datetime), 'region' (the original
        column header) and 'interest' (float). Rows whose interest is the
        string "<1" are dropped.
    """
    file = pd.read_csv(filepath, header=1)
    df = file.set_index('Week').stack().reset_index()
    df.columns = ['week', 'region', 'interest']
    df['week'] = pd.to_datetime(df['week'])
    # "<1" cannot be converted to a number, so those rows are removed.
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the unfiltered frame.
    df = df[df['interest'] != "<1"].copy()
    df['interest'] = df['interest'].astype(float)
    if plot:
        # Only open a figure when something will be drawn on it.
        plt.figure(figsize=(8, 3))
        sns.lineplot(data=df, x='week', y='interest', hue='region')
    return df
def read_geo(filepath, multi=False, plot=True):
    """
    Load a Google Trends interest-by-region CSV export and optionally plot it.

    This corresponds to the second graphic returned by trends.google.com.
    The file is read with ``header=1`` (the first line is skipped).

    Args:
        filepath: Path to the CSV file.
        multi: Use False (default) if only one keyword is being analyzed;
            the file's two columns are renamed to 'country' and 'interest'.
            Use True if more than one keyword is being analyzed; the file
            must then have a 'Country' column, the keyword columns are
            stacked into a 'category' column, and a trailing '%' is
            stripped from each interest value before numeric conversion.
        plot: If True (default), draw a seaborn bar plot on a new figure
            (first 25 non-null rows for multi=False; all non-null rows,
            coloured by category, for multi=True). If False, no figure is
            created.

    Returns:
        DataFrame sorted by 'interest' in descending order, with columns
        'country' and 'interest' (multi=False) or 'country', 'category'
        and 'interest' (multi=True).
    """
    file = pd.read_csv(filepath, header=1)
    if multi:
        file = file.set_index('Country').stack().reset_index()
        file.columns = ['country', 'category', 'interest']
        # Strip only a trailing percent sign instead of blindly dropping the
        # last character, so values without '%' keep all their digits.
        file['interest'] = pd.to_numeric(
            file['interest'].astype(str).str.rstrip('%')
        )
        if plot:
            plt.figure(figsize=(3, 8))
            sns.barplot(data=file.dropna(), y='country', x='interest', hue='category')
    else:
        file.columns = ['country', 'interest']
        if plot:
            plt.figure(figsize=(8, 4))
            sns.barplot(data=file.dropna().iloc[:25, :], y='country', x='interest')
    file = file.sort_values(ascending=False, by='interest')
    return file
# Read global interest in exercise (read_geo returns rows sorted by interest, highest first)
workout_global = read_geo('data/workout_global.csv')
# Get top 25 countries
top_25_countries = workout_global.head(25)
# String value of top country
top_country = top_25_countries['country'].iloc[0]
print(top_country)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='white', palette='Pastel2')
import os
def read_file(filepath, plot=True):
    """
    Load a Google Trends time-series CSV export as a long-format DataFrame.

    The file is read with ``header=1`` (the first line is treated as a
    preamble and skipped) and must contain a 'Week' column. Every other
    column is stacked so that each row of the result is one
    (week, region, interest) observation. This corresponds to the first
    graphic (time series) returned by trends.google.com.

    Args:
        filepath: Path to the CSV file.
        plot: If True (default), draw a seaborn line plot of interest over
            time, one line per region, on a new 8x3 figure. If False, no
            figure is created.

    Returns:
        DataFrame with columns 'week' (datetime), 'region' (the original
        column header) and 'interest' (float). Rows whose interest is the
        string "<1" are dropped.
    """
    file = pd.read_csv(filepath, header=1)
    df = file.set_index('Week').stack().reset_index()
    df.columns = ['week', 'region', 'interest']
    df['week'] = pd.to_datetime(df['week'])
    # "<1" cannot be converted to a number, so those rows are removed.
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the unfiltered frame.
    df = df[df['interest'] != "<1"].copy()
    df['interest'] = df['interest'].astype(float)
    if plot:
        # Only open a figure when something will be drawn on it.
        plt.figure(figsize=(8, 3))
        sns.lineplot(data=df, x='week', y='interest', hue='region')
    return df
def read_geo(filepath, multi=False, plot=True):
    """
    Load a Google Trends interest-by-region CSV export and optionally plot it.

    This corresponds to the second graphic returned by trends.google.com.
    The file is read with ``header=1`` (the first line is skipped).

    Args:
        filepath: Path to the CSV file.
        multi: Use False (default) if only one keyword is being analyzed;
            the file's two columns are renamed to 'country' and 'interest'.
            Use True if more than one keyword is being analyzed; the file
            must then have a 'Country' column, the keyword columns are
            stacked into a 'category' column, and a trailing '%' is
            stripped from each interest value before numeric conversion.
        plot: If True (default), draw a seaborn bar plot on a new figure
            (first 25 non-null rows for multi=False; all non-null rows,
            coloured by category, for multi=True). If False, no figure is
            created.

    Returns:
        DataFrame sorted by 'interest' in descending order, with columns
        'country' and 'interest' (multi=False) or 'country', 'category'
        and 'interest' (multi=True).
    """
    file = pd.read_csv(filepath, header=1)
    if multi:
        file = file.set_index('Country').stack().reset_index()
        file.columns = ['country', 'category', 'interest']
        # Strip only a trailing percent sign instead of blindly dropping the
        # last character, so values without '%' keep all their digits.
        file['interest'] = pd.to_numeric(
            file['interest'].astype(str).str.rstrip('%')
        )
        if plot:
            plt.figure(figsize=(3, 8))
            sns.barplot(data=file.dropna(), y='country', x='interest', hue='category')
    else:
        file.columns = ['country', 'interest']
        if plot:
            plt.figure(figsize=(8, 4))
            sns.barplot(data=file.dropna().iloc[:25, :], y='country', x='interest')
    file = file.sort_values(ascending=False, by='interest')
    return file
# Read three keywords by geographical location
geo_categories = read_geo('data/geo_three_keywords.csv', multi = True)
# Create MESA countries list
mesa_countries = ["Philippines", "Singapore", "United Arab Emirates", "Qatar", "Kuwait", "Lebanon", "Malaysia", "Sri Lanka", "India", "Pakistan"]
# Filter geo_categories by MESA countries
MESA = geo_categories.loc[geo_categories.country.isin(mesa_countries), :]
# Pivot to one row per country and one column per keyword category.
# set_index/unstack return a new frame, so the result must be kept in order to be printed;
# MESA itself is left in long format.
mesa_by_category = MESA.set_index(['country', 'category']).unstack()
print(mesa_by_category)
# Hard-coded answer; presumably read off the printed table — re-check if the data changes
top_home_workout_country = 'Philippines'
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='white', palette='Pastel2')
import os
def read_file(filepath, plot=True):
    """
    Load a Google Trends time-series CSV export as a long-format DataFrame.

    The file is read with ``header=1`` (the first line is treated as a
    preamble and skipped) and must contain a 'Week' column. Every other
    column is stacked so that each row of the result is one
    (week, region, interest) observation. This corresponds to the first
    graphic (time series) returned by trends.google.com.

    Args:
        filepath: Path to the CSV file.
        plot: If True (default), draw a seaborn line plot of interest over
            time, one line per region, on a new 8x3 figure. If False, no
            figure is created.

    Returns:
        DataFrame with columns 'week' (datetime), 'region' (the original
        column header) and 'interest' (float). Rows whose interest is the
        string "<1" are dropped.
    """
    file = pd.read_csv(filepath, header=1)
    df = file.set_index('Week').stack().reset_index()
    df.columns = ['week', 'region', 'interest']
    df['week'] = pd.to_datetime(df['week'])
    # "<1" cannot be converted to a number, so those rows are removed.
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the unfiltered frame.
    df = df[df['interest'] != "<1"].copy()
    df['interest'] = df['interest'].astype(float)
    if plot:
        # Only open a figure when something will be drawn on it.
        plt.figure(figsize=(8, 3))
        sns.lineplot(data=df, x='week', y='interest', hue='region')
    return df
def read_geo(filepath, multi=False, plot=True):
    """
    Load a Google Trends interest-by-region CSV export and optionally plot it.

    This corresponds to the second graphic returned by trends.google.com.
    The file is read with ``header=1`` (the first line is skipped).

    Args:
        filepath: Path to the CSV file.
        multi: Use False (default) if only one keyword is being analyzed;
            the file's two columns are renamed to 'country' and 'interest'.
            Use True if more than one keyword is being analyzed; the file
            must then have a 'Country' column, the keyword columns are
            stacked into a 'category' column, and a trailing '%' is
            stripped from each interest value before numeric conversion.
        plot: If True (default), draw a seaborn bar plot on a new figure
            (first 25 non-null rows for multi=False; all non-null rows,
            coloured by category, for multi=True). If False, no figure is
            created.

    Returns:
        DataFrame sorted by 'interest' in descending order, with columns
        'country' and 'interest' (multi=False) or 'country', 'category'
        and 'interest' (multi=True).
    """
    file = pd.read_csv(filepath, header=1)
    if multi:
        file = file.set_index('Country').stack().reset_index()
        file.columns = ['country', 'category', 'interest']
        # Strip only a trailing percent sign instead of blindly dropping the
        # last character, so values without '%' keep all their digits.
        file['interest'] = pd.to_numeric(
            file['interest'].astype(str).str.rstrip('%')
        )
        if plot:
            plt.figure(figsize=(3, 8))
            sns.barplot(data=file.dropna(), y='country', x='interest', hue='category')
    else:
        file.columns = ['country', 'interest']
        if plot:
            plt.figure(figsize=(8, 4))
            sns.barplot(data=file.dropna().iloc[:25, :], y='country', x='interest')
    file = file.sort_values(ascending=False, by='interest')
    return file
# Load the yoga/zumba trend files for Singapore and the Philippines, in that order
sng, phl = map(read_file, ['data/yoga_zumba_sng.csv', 'data/yoga_zumba_phl.csv'])
# Content types chosen for the pilot
pilot_content = list(('yoga', 'zumba'))