Project: Data-Driven Product Management: Conducting a Market Analysis

You are a product manager for a fitness studio and are interested in understanding the current demand for digital fitness classes. You plan to conduct a market analysis in Python to gauge demand and identify potential areas for growth of digital products and services.

The Data

You are provided with several CSV files in the "Files/data" folder. They contain worldwide and country-level Google Trends data for keyword searches about fitness and related products.

workout.csv

Column	Description
`'month'`	Month when the data was measured.
`'workout_worldwide'`	Index representing the popularity of the keyword 'workout', on a scale of 0 to 100.

three_keywords.csv

Column	Description
`'month'`	Month when the data was measured.
`'home_workout_worldwide'`	Index representing the popularity of the keyword 'home workout', on a scale of 0 to 100.
`'gym_workout_worldwide'`	Index representing the popularity of the keyword 'gym workout', on a scale of 0 to 100.
`'home_gym_worldwide'`	Index representing the popularity of the keyword 'home gym', on a scale of 0 to 100.

workout_geo.csv

Column	Description
`'country'`	Country where the data was measured.
`'workout_2018_2023'`	Index representing the popularity of the keyword 'workout' over the 5-year period from 2018 to 2023.

three_keywords_geo.csv

Column	Description
`'country'`	Country where the data was measured.
`'home_workout_2018_2023'`	Index representing the popularity of the keyword 'home workout' over the 5-year period from 2018 to 2023.
`'gym_workout_2018_2023'`	Index representing the popularity of the keyword 'gym workout' over the 5-year period from 2018 to 2023.
`'home_gym_2018_2023'`	Index representing the popularity of the keyword 'home gym' over the 5-year period from 2018 to 2023.

# Import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import os

# Start coding here
# Candidate locations for workout.csv, checked in the order listed; the first
# one that exists on disk is the one that gets loaded.
possible_paths = [
    'Files/data/workout.csv',
    'data/workout.csv',
    './workout.csv'
]

# First existing candidate, or None when none of them exists.
file_path = next(
    (candidate for candidate in possible_paths if os.path.exists(candidate)),
    None,
)
file_found = file_path is not None

# Stop early with an error that lists every location that was checked.
if not file_found:
    raise FileNotFoundError(
        f"File not found in any of the following locations: {possible_paths}. "
        "Please check the file path and ensure the file exists."
    )

# 'month' is parsed as datetimes at load time; show the first rows as a
# quick sanity check of the loaded frame.
workout_df = pd.read_csv(file_path, parse_dates=['month'])
print(workout_df.head())

# Line chart of the worldwide 'workout' index by month, used to eyeball
# where search interest peaks.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(workout_df['month'], workout_df['workout_worldwide'])
ax.set_title('Global Search Interest for "Workout" Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Interest Index (0-100)')
ax.grid(True)
plt.show()

# Locate the row whose 'workout_worldwide' index is the largest.
peak_row_label = workout_df['workout_worldwide'].idxmax()
peak_workout = workout_df.loc[peak_row_label]

# Keep only the calendar year of that row's month, as a "yyyy" string.
# Relies on 'month' holding datetime values (it is read with parse_dates).
year_str = str(peak_workout['month'].year)

print(f"The global search for 'workout' was at its peak in: {year_str}")

# Load three_keywords.csv from the first location where it can be read.
# The working directory and 'data/' are tried first (the original order);
# 'Files/data/' is the folder the project description names for the CSVs and
# is searched by the other loaders in this script, so it is tried as well.
keyword_paths = [
    'three_keywords.csv',
    'data/three_keywords.csv',
    'Files/data/three_keywords.csv',
]

for keyword_path in keyword_paths:
    try:
        # 'month' is parsed as datetimes so it can be filtered by date later.
        keywords_df = pd.read_csv(keyword_path, parse_dates=['month'])
    except FileNotFoundError:
        continue  # not at this location; try the next candidate
    break
else:
    # Raised outside of any except block so the traceback is not cluttered
    # with the chained per-path lookup failures.
    raise FileNotFoundError(
        "Could not find 'three_keywords.csv'. Please make sure the file exists "
        "in the current directory or in a 'data/' or 'Files/data/' subdirectory."
    )

# Show the first rows as a quick sanity check of the loaded frame.
print(keywords_df.head())

# The three keyword index columns that are compared against each other.
keyword_columns = ['home_workout_worldwide', 'gym_workout_worldwide', 'home_gym_worldwide']

# Window treated as the COVID pandemic period (March 2020 - Dec 2021);
# both end points are included.
covid_start = pd.to_datetime('2020-03-01')
covid_end = pd.to_datetime('2021-12-31')
in_covid_window = keywords_df['month'].between(covid_start, covid_end)
covid_df = keywords_df[in_covid_window]

# Mean index per keyword inside the window; the winner is the column with
# the highest mean, reported without its '_worldwide' suffix.
covid_avg_interest = covid_df[keyword_columns].mean()
peak_covid = covid_avg_interest.idxmax().replace('_worldwide', '')

# "Current" means the latest calendar year present in the data.
month_years = keywords_df['month'].dt.year
current_year = month_years.max()
current_df = keywords_df[month_years == current_year]

# Same comparison, restricted to that latest year.
current_avg_interest = current_df[keyword_columns].mean()
current = current_avg_interest.idxmax().replace('_worldwide', '')

print(f"Most popular keyword during COVID pandemic: {peak_covid}")
print(f"Most popular keyword in the current year ({current_year}): {current}")

# Locations searched for workout_geo.csv, in order: the working directory,
# then 'data/', then 'Files/data/'.
file_paths = ['workout_geo.csv', 'data/workout_geo.csv', 'Files/data/workout_geo.csv']

# First candidate that exists on disk, or None when none of them does.
path = next((candidate for candidate in file_paths if os.path.exists(candidate)), None)
if path is None:
    raise FileNotFoundError("Could not find 'workout_geo.csv' in the current, 'data/', or 'Files/data/' directories.")

workout_geo_df = pd.read_csv(path)

# Show the first rows as a quick sanity check of the loaded frame.
print(workout_geo_df.head())

# Restrict the comparison to the three countries of interest.
target_countries = ['United States', 'Australia', 'Japan']
is_target_country = workout_geo_df['country'].isin(target_countries)
filtered_geo_df = workout_geo_df[is_target_country]

# Row label of the largest 'workout_2018_2023' value among those countries,
# then the country name stored on that row.
top_row_label = filtered_geo_df['workout_2018_2023'].idxmax()
top_country = filtered_geo_df.loc[top_row_label, 'country']

print(f"The country with the highest interest in workouts among the United States, Australia, or Japan is: {top_country}")