Project: Data-Driven Product Management: Conducting a Market Analysis

You are a product manager for a fitness studio and are interested in understanding the current demand for digital fitness classes. You plan to conduct a market analysis in Python to gauge demand and identify potential areas for growth of digital products and services.

The Data

You are provided with several CSV files in the "Files/data" folder. They contain worldwide and country-level Google Trends data for keyword searches about fitness and related products.

workout.csv

Column	Description
`'month'`	Month when the data was measured.
`'workout_worldwide'`	Index representing the popularity of the keyword 'workout', on a scale of 0 to 100.

three_keywords.csv

Column	Description
`'month'`	Month when the data was measured.
`'home_workout_worldwide'`	Index representing the popularity of the keyword 'home workout', on a scale of 0 to 100.
`'gym_workout_worldwide'`	Index representing the popularity of the keyword 'gym workout', on a scale of 0 to 100.
`'home_gym_worldwide'`	Index representing the popularity of the keyword 'home gym', on a scale of 0 to 100.

workout_geo.csv

Column	Description
`'country'`	Country where the data was measured.
`'workout_2018_2023'`	Index representing the popularity of the keyword 'workout' during the 5-year period from 2018 to 2023.

three_keywords_geo.csv

Column	Description
`'country'`	Country where the data was measured.
`'home_workout_2018_2023'`	Index representing the popularity of the keyword 'home workout' during the 5-year period from 2018 to 2023.
`'gym_workout_2018_2023'`	Index representing the popularity of the keyword 'gym workout' during the 5-year period from 2018 to 2023.
`'home_gym_2018_2023'`	Index representing the popularity of the keyword 'home gym' during the 5-year period from 2018 to 2023.

# Import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import io

# Plot configuration for clearer visualizations: reset to matplotlib's
# default style, then set the default figure size and font size together.
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# ===============================================
# STEP 1: DATA LOADING AND INITIAL EXPLORATION
# ===============================================

print("=== ANÁLISIS DE MERCADO FITNESS - GOOGLE TRENDS ===\n")

# Load the four Google Trends datasets (paths are relative to the working directory)
workout_df = pd.read_csv('data/workout.csv')
workout_geo_df = pd.read_csv('data/workout_geo.csv')
three_keywords_df = pd.read_csv('data/three_keywords.csv')
three_keywords_geo_df = pd.read_csv('data/three_keywords_geo.csv')

# Report the (rows, columns) shape of every loaded table
print("📁 ARCHIVOS CARGADOS:")
for csv_name, frame in (
    ('workout.csv', workout_df),
    ('workout_geo.csv', workout_geo_df),
    ('three_keywords.csv', three_keywords_df),
    ('three_keywords_geo.csv', three_keywords_geo_df),
):
    print(f"✅ {csv_name}: {frame.shape}")

# Horizontal rule reused by the section headers of this step
section_rule = "=" * 60

print("\n" + section_rule)
print("🔍 EXPLORACIÓN INICIAL")
print(section_rule)

# workout.csv: monthly time series for the keyword 'workout'
print("\n1️⃣ WORKOUT.CSV - Tendencias temporales 'workout':")
print(f"Columnas: {workout_df.columns.tolist()}")
print("Tipos de datos:")
print(workout_df.dtypes)
print("\nPrimeras 5 filas:")
print(workout_df.head())
print("\nEstadísticas descriptivas:")
print(workout_df.describe())

# three_keywords.csv: monthly time series for the three related keywords
print("\n2️⃣ THREE_KEYWORDS.CSV - Tendencias de 3 keywords:")
print(f"Columnas: {three_keywords_df.columns.tolist()}")
print("Tipos de datos:")
print(three_keywords_df.dtypes)
print("\nPrimeras 5 filas:")
print(three_keywords_df.head())

# Date range covered, read from the first and last rows as stored in the file
workout_months = workout_df['month']
print("\n📅 RANGO TEMPORAL:")
print(f"Primer mes: {workout_months.iloc[0]}")
print(f"Último mes: {workout_months.iloc[-1]}")
print(f"Total de meses: {workout_df.shape[0]}")

# Missing-value counts per column for both time-series tables
print("\n🔍 CALIDAD DE DATOS:")
print("Valores faltantes en workout_df:")
print(workout_df.isna().sum())
print("\nValores faltantes en three_keywords_df:")
print(three_keywords_df.isna().sum())

print("\n" + section_rule)
print("📊 VISTA PREVIA DE DATOS GEOGRÁFICOS")
print(section_rule)

# workout_geo.csv: one row per country
print("\n3️⃣ WORKOUT_GEO.CSV - Datos por país:")
print(f"Columnas: {workout_geo_df.columns.tolist()}")
print(f"Número de países: {workout_geo_df.shape[0]}")
print("\nPrimeros 10 países:")
print(workout_geo_df.head(10))

# three_keywords_geo.csv: one row per country, three keyword columns
print("\n4️⃣ THREE_KEYWORDS_GEO.CSV - Keywords por país:")
print(f"Columnas: {three_keywords_geo_df.columns.tolist()}")
print(f"Número de países: {three_keywords_geo_df.shape[0]}")
print("\nPrimeros 5 países:")
print(three_keywords_geo_df.head())

print("\n✅ DATOS CARGADOS EXITOSAMENTE!")

# ===============================================
# STEP 2: TEMPORAL ANALYSIS AND QUESTIONS 1-2
# ===============================================

print("=" * 70)
print("📈 ANÁLISIS TEMPORAL - RESOLVIENDO PREGUNTAS")
print("=" * 70)

# -----------------------------------------------
# QUESTION 1: When did 'workout' searches peak?
# -----------------------------------------------

print("\n🎯 PREGUNTA 1: Pico global de búsqueda 'workout'")
print("-" * 50)

# Highest popularity value and every row that reaches it
workout_popularity = workout_df['workout_worldwide']
max_value = workout_popularity.max()
peak_row = workout_df.loc[workout_popularity.eq(max_value)]

print(f"Valor máximo: {max_value}")

# The first row at the maximum is treated as the peak
peak_month = peak_row['month'].iloc[0]
print(f"Mes del pico: {peak_month}")

# The answer is the year only ("yyyy"): the text before the first '-'
year_str, _, _ = peak_month.partition('-')

print("\n✅ RESPUESTA 1:")
print(f'year_str = "{year_str}"')

# Show up to three months on either side of the peak for context
print("\n📊 Contexto alrededor del pico:")
peak_index = peak_row.index[0]
context_start = max(0, peak_index - 3)
context_end = min(len(workout_df), peak_index + 4)
context_rows = workout_df.iloc[context_start:context_end]

for i, (month, popularity) in enumerate(
    zip(context_rows['month'], context_rows['workout_worldwide']),
    start=context_start,
):
    marker = " 🔥 PICO" if i == peak_index else ""
    print(f"{month}: {popularity}{marker}")

# -----------------------------------------------
# QUESTION 2: COVID vs. current - most popular keywords
# -----------------------------------------------

print(f"\n🎯 PREGUNTA 2: Keywords durante COVID vs Actual")
print("-" * 50)

# Parse the month strings once; every period filter below is based on this column
three_keywords_df['date'] = pd.to_datetime(three_keywords_df['month'])

# Period boundaries, inclusive on both ends, at month granularity.
# COVID peak: April-July 2020 (chosen by the author because April 2020 was
# the peak of 'workout').
covid_start = '2020-04'
covid_end = '2020-07'

# Current period: the six months from October 2022 through March 2023
# (presumably the last six months in the file — confirm against the data).
actual_start = '2022-10'
actual_end = '2023-03'

print(f"Período COVID: {covid_start} a {covid_end}")
print(f"Período Actual: {actual_start} a {actual_end}")

# Compare months, not raw strings. If the CSV stores full dates, the string
# test '2020-07-01' <= '2020-07' is False, which silently drops the last
# month of each period. Monthly periods include the end month whether or not
# the stored value carries a day.
month_period = three_keywords_df['date'].dt.to_period('M')

# Rows inside the COVID period
covid_mask = (
    (month_period >= pd.Period(covid_start, freq='M'))
    & (month_period <= pd.Period(covid_end, freq='M'))
)
covid_data = three_keywords_df[covid_mask]

# Rows inside the current period
actual_mask = (
    (month_period >= pd.Period(actual_start, freq='M'))
    & (month_period <= pd.Period(actual_end, freq='M'))
)
actual_data = three_keywords_df[actual_mask]

print(f"\nDatos COVID: {len(covid_data)} meses")
print(f"Datos Actual: {len(actual_data)} meses")

# An empty period yields NaN means, and max() over NaN picks an arbitrary
# keyword; warn instead of reporting a meaningless answer silently.
for period_label, period_frame in (('COVID', covid_data), ('Actual', actual_data)):
    if period_frame.empty:
        print(f"⚠️ Sin datos para el período {period_label}: el promedio no es confiable")

# Answer label -> source column, in the order the results are reported
keyword_columns = {
    'home_workout': 'home_workout_worldwide',
    'gym_workout': 'gym_workout_worldwide',
    'home_gym': 'home_gym_worldwide',
}

# Mean popularity of each keyword within each period
covid_means = {
    keyword: covid_data[column].mean()
    for keyword, column in keyword_columns.items()
}
actual_means = {
    keyword: actual_data[column].mean()
    for keyword, column in keyword_columns.items()
}

print(f"\n📊 Promedios durante COVID:")
for keyword, value in covid_means.items():
    print(f"   {keyword}: {value:.1f}")

print(f"\n📊 Promedios actuales:")
for keyword, value in actual_means.items():
    print(f"   {keyword}: {value:.1f}")

# The most popular keyword in a period is the one with the highest mean
peak_covid = max(covid_means, key=covid_means.get)
current = max(actual_means, key=actual_means.get)

print(f"\n✅ RESPUESTA 2:")
print(f"peak_covid = \"{peak_covid}\"")
print(f"current = \"{current}\"")

# -----------------------------------------------
# Time-trend visualization
# -----------------------------------------------

print("\n📈 Creando visualización de tendencias...")

# Two stacked panels: 'workout' alone on top, the three keywords below
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

# Panel 1: 'workout' trend, plotted against row position, with the peak marked
workout_positions = range(len(workout_df))
ax1.plot(workout_positions, workout_df['workout_worldwide'],
         linewidth=2, color='blue', marker='o', markersize=4)
ax1.axvline(x=peak_index, color='red', linestyle='--', alpha=0.7,
            label=f'Pico: {peak_month}')
ax1.set_title('Tendencia Global de Búsquedas "workout" (2018-2023)', fontsize=14, fontweight='bold')
ax1.set_ylabel('Índice de Popularidad (0-100)')
ax1.grid(True, alpha=0.3)
ax1.legend()

# A tick for every month, but a visible label only on every sixth one
month_labels = [
    month if position % 6 == 0 else ''
    for position, month in enumerate(workout_df['month'])
]
ax1.set_xticks(workout_positions)
ax1.set_xticklabels(month_labels, rotation=45)

# Panel 2: the three keywords; each entry is (column, color, marker, legend label)
keyword_positions = range(len(three_keywords_df))
for column, line_color, line_marker, legend_label in (
    ('home_workout_worldwide', 'green', 's', 'home_workout'),
    ('gym_workout_worldwide', 'orange', '^', 'gym_workout'),
    ('home_gym_worldwide', 'purple', 'D', 'home_gym'),
):
    ax2.plot(keyword_positions, three_keywords_df[column],
             linewidth=2, color=line_color, marker=line_marker,
             markersize=3, label=legend_label)

ax2.set_title('Comparación de Keywords Fitness (2018-2023)', fontsize=14, fontweight='bold')
ax2.set_ylabel('Índice de Popularidad (0-100)')
ax2.set_xlabel('Período')
ax2.grid(True, alpha=0.3)
ax2.legend()

# Ticks and labels only on every sixth month
ax2.set_xticks(range(0, len(three_keywords_df), 6))
ax2.set_xticklabels(three_keywords_df['month'].iloc[::6].tolist(), rotation=45)

plt.tight_layout()
plt.show()

print("\n✅ PASO 2 COMPLETADO!")
print("📊 Visualizaciones generadas")
print("🎯 Preguntas 1 y 2 resueltas")

# ===============================================
# STEP 3: GEOGRAPHIC ANALYSIS - QUESTIONS 3-4
# ===============================================

print("=" * 70)
print("🌍 ANÁLISIS GEOGRÁFICO - PREGUNTAS FINALES")
print("=" * 70)

# -----------------------------------------------
# QUESTION 3: Country with the highest interest in workouts
# (among United States, Australia, Japan)
# -----------------------------------------------

print("\n🎯 PREGUNTA 3: País con mayor interés en 'workout'")
print("Comparando: United States, Australia, Japan")
print("-" * 55)

# Countries to compare, in reporting order
target_countries = ['United States', 'Australia', 'Japan']

# One record per target country that is present in the dataset
country_workout_data = []

for country in target_countries:
    # Exact-name match; the first matching row supplies the value
    country_row = workout_geo_df.loc[workout_geo_df['country'].eq(country)]

    if country_row.empty:
        print(f"❌ {country}: No encontrado")
        continue

    value = country_row['workout_2018_2023'].iloc[0]
    country_workout_data.append({'country': country, 'workout_interest': value})
    print(f"✅ {country}: {value}")

# Tabulate the matches so the maximum can be looked up by label
comparison_df = pd.DataFrame(country_workout_data)

if comparison_df.empty:
    print("❌ No se encontraron datos para los países objetivo")
else:
    # Row holding the highest interest value
    top_position = comparison_df['workout_interest'].idxmax()
    top_country_row = comparison_df.loc[top_position]
    top_country = top_country_row['country']

    print("\n✅ RESPUESTA 3:")
    print(f'top_country = "{top_country}"')
    print(f"Valor: {top_country_row['workout_interest']}")

# -----------------------------------------------
# QUESTION 4: Philippines vs Malaysia - home workouts
# -----------------------------------------------

print(f"\n🎯 PREGUNTA 4: Filipinas vs Malasia - Home workouts")
print("-" * 50)

# Show the real column names: the file may spell the country column as
# 'Country' or 'country'.
print("Columnas disponibles en three_keywords_geo_df:")
print(three_keywords_geo_df.columns.tolist())

# Resolve the country column once, ignoring case. Hard-coding 'Country'
# raises KeyError when the file uses the documented lowercase 'country'.
geo_country_col = next(
    (
        col for col in three_keywords_geo_df.columns
        if str(col).strip().lower() == 'country'
    ),
    None,
)
if geo_country_col is None:
    raise KeyError("three_keywords_geo_df no tiene una columna 'country'/'Country'")
geo_countries = three_keywords_geo_df[geo_country_col]

# Countries to compare for home workouts
target_countries_2 = ['Philippines', 'Malaysia']

# One record per target country that is present in the dataset
home_workout_data = []

for country in target_countries_2:
    # Exact-name match; the first matching row supplies the value
    country_row = three_keywords_geo_df[geo_countries == country]

    if not country_row.empty:
        value = country_row['home_workout_2018_2023'].iloc[0]
        home_workout_data.append({
            'country': country,
            'home_workout_interest': value
        })
        print(f"✅ {country}: {value}")
    else:
        print(f"❌ {country}: No encontrado")
        # Debugging aid: list names sharing the first four letters
        similar = three_keywords_geo_df[
            geo_countries.str.contains(country[:4], case=False, na=False)
        ]
        if not similar.empty:
            print(f"   Países similares encontrados: {similar[geo_country_col].tolist()}")

# Tabulate the matches so the maximum can be looked up by label
home_workout_comparison_df = pd.DataFrame(home_workout_data)

# Only answer when every target country was found; a partial comparison
# would be misleading.
if len(home_workout_comparison_df) == len(target_countries_2):
    top_home_workout_row = home_workout_comparison_df.loc[
        home_workout_comparison_df['home_workout_interest'].idxmax()
    ]
    home_workout_geo = top_home_workout_row['country']

    print(f"\n✅ RESPUESTA 4:")
    print(f"home_workout_geo = \"{home_workout_geo}\"")
    print(f"Valor: {top_home_workout_row['home_workout_interest']}")
else:
    print("❌ No se encontraron datos completos para ambos países")

# -----------------------------------------------
# Additional analysis of the geographic data
# -----------------------------------------------

print(f"\n📊 ANÁLISIS ADICIONAL DE DATOS GEOGRÁFICOS")
print("-" * 50)

# Resolve the country column of three_keywords_geo_df once, ignoring case.
# Hard-coding 'Country' raises KeyError when the file uses the documented
# lowercase 'country'.
geo_country_col = next(
    (
        col for col in three_keywords_geo_df.columns
        if str(col).strip().lower() == 'country'
    ),
    None,
)
if geo_country_col is None:
    raise KeyError("three_keywords_geo_df no tiene una columna 'country'/'Country'")

# Top 10 countries by interest in 'workout' (rows without a value excluded)
print("\n🏆 TOP 10 países con mayor interés en 'workout':")
top_workout_countries = workout_geo_df.dropna(subset=['workout_2018_2023']).nlargest(10, 'workout_2018_2023')
for i, (name, score) in enumerate(
    zip(top_workout_countries['country'], top_workout_countries['workout_2018_2023']), 1
):
    print(f"{i:2}. {name}: {score}")

# Top 10 countries by interest in 'home workout' (rows without a value excluded)
print("\n🏠 TOP 10 países con mayor interés en 'home workout':")
top_home_workout_countries = three_keywords_geo_df.dropna(subset=['home_workout_2018_2023']).nlargest(10, 'home_workout_2018_2023')
for i, (name, score) in enumerate(
    zip(top_home_workout_countries[geo_country_col], top_home_workout_countries['home_workout_2018_2023']), 1
):
    print(f"{i:2}. {name}: {score}")

# Coverage: countries with a non-missing value out of all listed countries
print(f"\n📈 ESTADÍSTICAS GENERALES:")
print(f"Países con datos de 'workout': {workout_geo_df['workout_2018_2023'].notna().sum()}/{len(workout_geo_df)}")
print(f"Países con datos de 'home_workout': {three_keywords_geo_df['home_workout_2018_2023'].notna().sum()}/{len(three_keywords_geo_df)}")

# Check that the countries of interest appear, by exact name, in both geo tables
important_countries = ['United States', 'Australia', 'Japan', 'Philippines', 'Malaysia',
                       'Canada', 'United Kingdom', 'Germany', 'France', 'Brazil']

print(f"\n🔍 VERIFICACIÓN DE PAÍSES IMPORTANTES:")
for country in important_countries:
    # bool() keeps the printed value a plain True/False
    workout_exists = bool((workout_geo_df['country'] == country).any())
    home_workout_exists = bool((three_keywords_geo_df[geo_country_col] == country).any())
    print(f"{country}: Workout({workout_exists}) | Home_workout({home_workout_exists})")

print("\n✅ PASO 3 COMPLETADO!")
print("🌍 Análisis geográfico finalizado")
print("🎯 Todas las preguntas resueltas")