Skip to content
Project: Data-Driven Product Management: Conducting a Market Analysis
You are a product manager for a fitness studio and are interested in understanding the current demand for digital fitness classes. You plan to conduct a market analysis in Python to gauge demand and identify potential areas for growth of digital products and services.
The Data
You are provided with a number of CSV files in the "Files/data" folder, which offer international and national-level data on Google Trends keyword searches related to fitness and related products.
workout.csv
| Column | Description |
|---|---|
| 'month' | Month when the data was measured. |
| 'workout_worldwide' | Index representing the popularity of the keyword 'workout', on a scale of 0 to 100. |
three_keywords.csv
| Column | Description |
|---|---|
| 'month' | Month when the data was measured. |
| 'home_workout_worldwide' | Index representing the popularity of the keyword 'home workout', on a scale of 0 to 100. |
| 'gym_workout_worldwide' | Index representing the popularity of the keyword 'gym workout', on a scale of 0 to 100. |
| 'home_gym_worldwide' | Index representing the popularity of the keyword 'home gym', on a scale of 0 to 100. |
workout_geo.csv
| Column | Description |
|---|---|
| 'country' | Country where the data was measured. |
| 'workout_2018_2023' | Index representing the popularity of the keyword 'workout' during the 5-year period. |
three_keywords_geo.csv
| Column | Description |
|---|---|
| 'country' | Country where the data was measured. |
| 'home_workout_2018_2023' | Index representing the popularity of the keyword 'home workout' during the 5-year period. |
| 'gym_workout_2018_2023' | Index representing the popularity of the keyword 'gym workout' during the 5-year period. |
| 'home_gym_2018_2023' | Index representing the popularity of the keyword 'home gym' during the 5-year period. |
# Import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import os# Start coding here
# Locations where workout.csv may live, listed in the order they are tried.
possible_paths = [
    'Files/data/workout.csv',
    'data/workout.csv',
    './workout.csv',
]

# Pick the first candidate that exists on disk; None means nothing matched.
file_path = next(
    (candidate for candidate in possible_paths if os.path.exists(candidate)),
    None,
)
file_found = file_path is not None

if not file_found:
    raise FileNotFoundError(
        f"File not found in any of the following locations: {possible_paths}. "
        "Please check the file path and ensure the file exists."
    )

# Read the monthly series; 'month' is parsed into datetimes on load.
workout_df = pd.read_csv(file_path, parse_dates=['month'])
print(workout_df.head())

# Plot the data to visualize peak interest
# Single line chart of the monthly 'workout' index on a 12x6 inch canvas,
# built through the Axes object rather than the pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(workout_df['month'], workout_df['workout_worldwide'])
ax.set_title('Global Search Interest for "Workout" Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Interest Index (0-100)')
ax.grid(True)
plt.show()
# Index label of the month where the 'workout' index is highest, then
# the full row for that month.
peak_idx = workout_df['workout_worldwide'].idxmax()
peak_workout = workout_df.loc[peak_idx]

# Year of the peak as a "yyyy" string.
year_str = str(peak_workout['month'].year)
print(f"The global search for 'workout' was at its peak in: {year_str}")

# Load the three-keyword monthly series. Candidate locations are tried in
# order (the original two first, so existing setups resolve exactly as
# before); "Files/data" is included because the project brief states the
# CSV files are provided in that folder.
keyword_paths = [
    'three_keywords.csv',
    'data/three_keywords.csv',
    'Files/data/three_keywords.csv',
]
for keyword_path in keyword_paths:
    try:
        keywords_df = pd.read_csv(keyword_path, parse_dates=['month'])
    except FileNotFoundError:
        continue  # not at this location; try the next candidate
    break
else:
    # The loop finished without a successful read: no candidate exists.
    raise FileNotFoundError(
        "Could not find 'three_keywords.csv'. Please make sure the file exists in the "
        "current directory or in a 'data/' or 'Files/data/' subdirectory."
    )
print(keywords_df.head())
# The three keyword series compared in both time windows below.
keyword_cols = ['home_workout_worldwide', 'gym_workout_worldwide', 'home_gym_worldwide']

# COVID pandemic window (e.g., March 2020 - Dec 2021), both ends inclusive.
covid_start = pd.to_datetime('2020-03-01')
covid_end = pd.to_datetime('2021-12-31')
in_covid_window = keywords_df['month'].between(covid_start, covid_end)
covid_df = keywords_df[in_covid_window]

# Mean index per keyword inside the window; the winner is reported
# without its '_worldwide' suffix.
covid_avg_interest = covid_df[keyword_cols].mean()
peak_covid = covid_avg_interest.idxmax().replace('_worldwide', '')

# "Current" means the latest calendar year present in the data.
month_years = keywords_df['month'].dt.year
current_year = month_years.max()
current_df = keywords_df[month_years == current_year]

# Same comparison, restricted to that year.
current_avg_interest = current_df[keyword_cols].mean()
current = current_avg_interest.idxmax().replace('_worldwide', '')

print(f"Most popular keyword during COVID pandemic: {peak_covid}")
print(f"Most popular keyword in the current year ({current_year}): {current}")

# Load the workout_geo data
# Candidate locations for workout_geo.csv, tried in this order.
file_paths = ['workout_geo.csv', 'data/workout_geo.csv', 'Files/data/workout_geo.csv']

# First existing candidate, or None when every location is missing.
path = next((candidate for candidate in file_paths if os.path.exists(candidate)), None)
if path is None:
    raise FileNotFoundError("Could not find 'workout_geo.csv' in the current, 'data/', or 'Files/data/' directories.")

workout_geo_df = pd.read_csv(path)
print(workout_geo_df.head())
# Restrict the per-country table to the three markets being compared.
target_countries = ['United States', 'Australia', 'Japan']
filtered_geo_df = workout_geo_df[workout_geo_df['country'].isin(target_countries)]

# idxmax on an empty selection fails with an opaque message, so report
# the actual problem (no target country present) explicitly.
if filtered_geo_df.empty:
    raise ValueError(
        f"None of {target_countries} were found in the 'country' column of workout_geo.csv."
    )

# Country whose 'workout' index over 2018-2023 is the largest of the three.
top_country = filtered_geo_df.loc[filtered_geo_df['workout_2018_2023'].idxmax(), 'country']
print(f"The country with the highest interest in workouts among the United States, Australia, or Japan is: {top_country}")

# Re-import carried over from the original notebook cell boundary; it was
# fused onto the print statement above and is now on its own line.
import pandas as pd
def resolve_column(frame, wanted):
    """Return the label of the column in *frame* that matches *wanted*.

    Labels are compared after trimming surrounding whitespace and
    lower-casing both sides, so a header such as ' Country ' matches
    'country'. The first matching label is returned.

    Raises:
        KeyError: if no column matches *wanted*.
    """
    target = wanted.strip().lower()
    for label in frame.columns:
        if label.strip().lower() == target:
            return label
    raise KeyError(f"No '{wanted}' column found in the CSV file after stripping whitespace.")


# Candidate locations for three_keywords_geo.csv. The original path stays
# first so existing setups resolve exactly as before; "Files/data" is the
# folder the project brief names as the home of the CSV files.
csv_path = 'data/three_keywords_geo.csv'
csv_candidates = [
    csv_path,
    'Files/data/three_keywords_geo.csv',
    'three_keywords_geo.csv',
]
target_home_workout_countries = ['Philippines', 'Malaysia']

# Try each location in order; csv_path ends up pointing at the file used.
keywords_geo_df = None
for candidate in csv_candidates:
    try:
        keywords_geo_df = pd.read_csv(candidate)
    except FileNotFoundError:
        continue  # not at this location; try the next candidate
    csv_path = candidate
    break

if keywords_geo_df is None:
    # Best-effort, as before: report the problem instead of raising.
    searched = ", ".join(csv_candidates)
    print(f"File not found: {searched}. Please check the file path and try again.")
else:
    print(keywords_geo_df.head())
    # Header cells may carry stray whitespace; normalise before lookup.
    keywords_geo_df.columns = keywords_geo_df.columns.str.strip()
    print("Columns after stripping:", keywords_geo_df.columns.tolist())
    try:
        country_col = resolve_column(keywords_geo_df, 'country')
        hw_col = resolve_column(keywords_geo_df, 'home_workout_2018_2023')

        # Filter for the specified countries (Philippines and Malaysia)
        filtered_home_workout_geo_df = keywords_geo_df[
            keywords_geo_df[country_col].isin(target_home_workout_countries)
        ]
        if filtered_home_workout_geo_df.empty:
            raise ValueError(
                f"None of {target_home_workout_countries} were found in the '{country_col}' column."
            )

        # Country with the highest 'home workout' index among the two.
        home_workout_geo = filtered_home_workout_geo_df.loc[
            filtered_home_workout_geo_df[hw_col].idxmax(), country_col
        ]
        print(f"Between the Philippines and Malaysia, {home_workout_geo} has the highest interest in home workouts.")
    except KeyError as e:
        print(f"KeyError: {e}. Please check the column names in your CSV file.")