Baby names analysis in the USA

# Importing pandas and matplotlib and necessary modules
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from matplotlib.ticker import EngFormatter
# Read in the baby names CSV as a DataFrame
names = pd.read_csv("baby_names.csv")

# Check the data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line to the output.
names.info()

# Check for unique values in categorical columns
print(names['first_name'].nunique())
print(names['sex'].unique())

Hidden output

1 hidden cell

# Total number of babies per year, pivoted so that each sex ('F' / 'M') gets its own column
births_by_year_sex = names.groupby(['year', 'sex'])['num'].sum()
yearly_trends_sex = births_by_year_sex.unstack().reset_index()

# Hover text shared by both lines
hover_text = 'Year: %{x}<br>Number of Babies: %{y:,}<extra></extra>'

# Build the interactive figure: one line per sex, girls first and boys second
fig = go.Figure()
for sex_column, legend_label, line_color in (
    ('F', 'Female', 'purple'),
    ('M', 'Male', 'green'),
):
    fig.add_trace(go.Scatter(
        x=yearly_trends_sex['year'],
        y=yearly_trends_sex[sex_column],
        mode='lines',
        name=legend_label,
        line=dict(color=line_color),
        hovertemplate=hover_text,
    ))

# Titles, legend and axis formatting
fig.update_layout(
    template='plotly_white',
    title='Total Number of Babies Named Each Year by Gender',
    legend_title='Gender',
    xaxis_title='Year',
    yaxis_title='Number of Babies',
    yaxis_tickformat=',',  # comma as thousands separator on the y axis
)

# Render the figure
fig.show()

# Total count per first name over the whole dataset, stored in a 'number' column
# ('grouped_names' is kept because a later cell uses it)
name_totals = names.groupby('first_name')['num'].sum()
grouped_names = name_totals.reset_index().rename(columns={'num': 'number'})
# Rank the names from most to least frequent
top_names = grouped_names.sort_values(by='number', ascending=False)
# Display the ranking
top_names

Classify each name's popularity according to the number of years that the name appears in the dataset.

# Both aggregates below are computed per first name
records_by_name = names.groupby('first_name')

# Total number of babies per name, exposed as 'number'
popular_names_sum = (
    records_by_name['num']
    .sum()
    .reset_index()
    .rename(columns={'num': 'number'})
)
# Number of distinct years in which each name appears, exposed as 'year_count'
popular_names_years = (
    records_by_name['year']
    .nunique()
    .reset_index()
    .rename(columns={'year': 'year_count'})
)
# Combine totals and year counts into one row per name
popular_names = popular_names_sum.merge(popular_names_years, on='first_name')

# Classify a name by the number of years in which it appears in the dataset.
def classify_popularity(year_count):
    """Return the popularity label for a name seen in `year_count` years.

    More than 80 years is 'Classic', more than 50 is 'Semi-classic',
    more than 20 is 'Semi-trendy', and anything else is 'Trendy'.
    """
    # Tiers are checked from the highest threshold down; the first one exceeded wins
    tiers = (
        (80, 'Classic'),
        (50, 'Semi-classic'),
        (20, 'Semi-trendy'),
    )
    for lower_bound, label in tiers:
        if year_count > lower_bound:
            return label
    return 'Trendy'
# Label every name with its popularity tier, derived from its year count
popular_names['popularity_type'] = popular_names['year_count'].map(classify_popularity)
# The year count has served its purpose; remove the column in place
del popular_names['year_count']
# Alphabetical listing by first name
result = popular_names.sort_values('first_name')

# Show the first 20 rows
result.head(20)

# Count how many names fall into each popularity type (columns: popularity_type, count)
type_counts = result['popularity_type'].value_counts()
df = type_counts.rename_axis('popularity_type').reset_index(name='count')

# Bar chart of the counts, one colour per popularity type
fig = px.bar(
    df,
    x='popularity_type',
    y='count',
    color='popularity_type',
    labels={'popularity_type': 'Popularity Type', 'count': 'Number of Names'},
    title='Number of Names by Popularity Type',
)

# Hover shows the type in bold and the count with thousands separators
fig.update_traces(hovertemplate='<b>%{x}</b><br>Number: %{y:,}')

# Render the figure
fig.show()

# Rank names by total count and keep the 20 largest
ranked_by_total = result.sort_values(by='number', ascending=False)
top_20 = ranked_by_total.head(20)

# Interactive bar chart of the top 20, coloured by popularity type;
# hover_data exposes the popularity type to the hover template as customdata[0]
fig = px.bar(
    top_20,
    x='first_name',
    y='number',
    color='popularity_type',
    hover_data={'popularity_type': True},
    labels={'first_name': 'First Name', 'number': 'Number of Occurrences'},
    title='Top 20 Popular Names',
)

# Hover shows the name in bold, the formatted total and the popularity type
fig.update_traces(hovertemplate='<b>%{x}</b><br>Number: %{y:,}<br>Popularity: %{customdata[0]}')

# Render the figure
fig.show()

# Attach each name's popularity type to every row of the original data
name_types = popular_names[['first_name', 'popularity_type']]
names_with_popularity = pd.merge(names, name_types, on='first_name')
# Number of rows per (year, popularity type) pair
popularity_over_years = (
    names_with_popularity
    .groupby(['year', 'popularity_type'])
    .size()
    .reset_index(name='count')
)
# One line per popularity type across the years
fig = px.line(
    popularity_over_years,
    x='year',
    y='count',
    color='popularity_type',
    labels={'year': 'Year', 'count': 'total of names'},
    title='Popularity Types Over Years',
)
# Legend heading
fig.update_layout(legend_title_text='Popularity Type')

# Render the figure
fig.show()

# One fixed colour per popularity type so that all charts agree
popularity_types = ('Classic', 'Semi-classic', 'Semi-trendy', 'Trendy')
palette = ('#facc5f', '#43d7a4', '#4095db', '#6568a0')  # yellow, green, blue, purple
color_map = dict(zip(popularity_types, palette))

# First chart: Number of Names by Popularity Type
type_counts = result['popularity_type'].value_counts()
df = type_counts.rename_axis('popularity_type').reset_index(name='count')
fig1 = px.bar(
    df,
    x='popularity_type',
    y='count',
    color='popularity_type',
    color_discrete_map=color_map,
    labels={'popularity_type': 'Popularity Type', 'count': 'Number of Names'},
    title='Number of Names by Popularity Type',
)
# Hover shows the type in bold and the count with thousands separators
fig1.update_traces(hovertemplate='<b>%{x}</b><br>Number: %{y:,}')

# Second chart: Top 20 Popular Names (the 20 names with the largest totals)
ranked_by_total = result.sort_values(by='number', ascending=False)
top_20 = ranked_by_total.head(20)
fig2 = px.bar(
    top_20,
    x='first_name',
    y='number',
    color='popularity_type',
    color_discrete_map=color_map,
    hover_data={'popularity_type': True},  # available to the hover template as customdata[0]
    labels={'first_name': 'First Name', 'number': 'Number of Occurrences'},
    title='Top 20 Popular Names',
)
# Hover shows the name in bold, the formatted total and the popularity type
fig2.update_traces(hovertemplate='<b>%{x}</b><br>Number: %{y:,}<br>Popularity: %{customdata[0]}')
# Third chart: Popularity Types Over Years
# Tag every row of the original data with its name's popularity type,
# then count the rows per (year, popularity type) pair
name_types = popular_names[['first_name', 'popularity_type']]
names_with_popularity = pd.merge(names, name_types, on='first_name')
popularity_over_years = (
    names_with_popularity
    .groupby(['year', 'popularity_type'])
    .size()
    .reset_index(name='count')
)
fig3 = px.line(
    popularity_over_years,
    x='year',
    y='count',
    color='popularity_type',
    color_discrete_map=color_map,
    labels={'year': 'Year', 'count': 'Total of Names'},
    title='Popularity Types Over Years',
)
# Legend heading
fig3.update_layout(legend_title_text='Popularity Type')

# Create a 2x2 grid of subplots; only three charts are added, so the cell at
# row 2, col 2 stays empty
fig = make_subplots(rows=2, cols=2, subplot_titles=("Number of Names by Popularity Type", "Top 20 Popular Names", "Popularity Types Over Years"))

# Add the first chart to the first subplot
for trace in fig1['data']:
    fig.add_trace(trace, row=1, col=1)

# Add the second chart to the second subplot
for trace in fig2['data']:
    fig.add_trace(trace, row=1, col=2)

# Add the third chart to the third subplot
for trace in fig3['data']:
    fig.add_trace(trace, row=2, col=1)

# Only the traces are copied above; the layout of the Plotly Express figures
# (axis titles, bar mode) is not, so the relevant parts are restored below.

# Each source figure has one trace per popularity type, so the combined legend
# would list every type once per chart. Keep the entries of the first chart only.
# Plotly Express puts traces of the same colour value in a common legend group,
# so one legend entry still toggles that type in all three subplots.
fig.update_traces(showlegend=False, row=1, col=2)
fig.update_traces(showlegend=False, row=2, col=1)

# Axis titles matching the labels used in the individual charts
fig.update_xaxes(title_text='Popularity Type', row=1, col=1)
fig.update_yaxes(title_text='Number of Names', row=1, col=1)
fig.update_xaxes(title_text='First Name', row=1, col=2)
fig.update_yaxes(title_text='Number of Occurrences', row=1, col=2)
fig.update_xaxes(title_text='Year', row=2, col=1)
fig.update_yaxes(title_text='Total of Names', row=2, col=1)

# Update layout.
# barmode='relative' is the Plotly Express default for bar charts; without it the
# figure falls back to 'group', which squeezes and offsets the per-type bars.
fig.update_layout(
    title_text="Comparison of Popularity Types and Top 20 Names",
    showlegend=True,
    legend_title_text='Popularity Type',
    barmode='relative',
    height=800,
)

# Show the plot
fig.show()

# Every distinct year present in the dataset, held as a set for set-difference lookups
observed_years = names['year'].unique()
all_years = set(observed_years)

# List the dataset years in which a given name does not appear
def find_missing_years(name):
    """Return the sorted list of years from `all_years` with no record for `name`."""
    years_for_name = names.loc[names['first_name'] == name, 'year']
    years_present = set(years_for_name.unique())
    return sorted(all_years.difference(years_present))

# Look up the earliest year in which a given name appears
def find_first_year(name):
    """Return the smallest year recorded for `name`, or None if it has no records."""
    years_for_name = names.loc[names['first_name'] == name, 'year']
    return None if years_for_name.empty else years_for_name.min()

# Create a DataFrame with the missing years and first year for each name, by popularity type.
#
# The years in which each name appears and its first year are computed once with
# grouped operations. Filtering the whole `names` DataFrame separately for every
# name scans the data twice per name and becomes very slow with many distinct names.
years_by_name = names.groupby('first_name')['year'].unique().to_dict()
first_year_by_name = names.groupby('first_name')['year'].min()

results = []
for popularity_type in popular_names['popularity_type'].unique():
    type_names = popular_names[popular_names['popularity_type'] == popularity_type]
    missing_years_df = pd.DataFrame({
        'first_name': type_names['first_name'],
        'popularity_type': popularity_type,
        # Sorted dataset years with no record for the name; a name without any
        # record is treated as missing from every year
        'missing_years': type_names['first_name'].map(
            lambda name: sorted(all_years.difference(years_by_name.get(name, ())))
        ),
        # Earliest year on record for the name
        'first_year': type_names['first_name'].map(first_year_by_name),
    })
    # Number of years the name is absent from
    missing_years_df['missing_count'] = missing_years_df['missing_years'].apply(len)
    results.append(missing_years_df)

# Concatenate all results into a single DataFrame (rows stay grouped by popularity type)
all_missing_years_df = pd.concat(results)

# Display the full results for all popularity types: types in alphabetical order,
# names with the most missing years first within each type
all_missing_years_df[['first_name', 'popularity_type', 'first_year', 'missing_count']].sort_values(by=['popularity_type', 'missing_count'], ascending=[True, False])

The analysis of names across different popularity types revealed that:

"Classic" names like Daniel, Mary, and Richard have minimal missing years, indicating their consistent popularity over time.
"Classic" names like Charles, David, Elisabeth, and James are present in all 101 years (1920-2020).
"Trendy" names such as Dillon, Ariana, and Ariel have significant gaps, with many missing for 100 out of the 101 years, reflecting their sporadic popularity. This suggests that "Classic" names tend to have a more stable presence, while "Trendy" names experience fluctuating popularity.

# Keep only the rows recorded for girls
is_female = names['sex'] == 'F'
female_names = names[is_female]

# Total count per female first name
female_name_counts = female_names.groupby('first_name')['num'].sum().reset_index()

# Rank from most to least frequent
top_female_names = female_name_counts.sort_values(by='num', ascending=False)

# Show the ten most frequent female names
top_female_names.head(10)

For the period 1920-2020, the five most common female names are: 'Mary', 'Patricia', 'Elizabeth', 'Jennifer', 'Linda'.

‌
‌
‌