Skip to content
Stanford GSB - CES Search Fund Task (For Review) (copy)
# Documentation of steps for Search Fund Analyst Task
# 1. Find the countries of players - derived from Master.CSV
# 2. Find the Hall of Famers data - derived from HallOfFame.CSV
# 3. Index HOF by only showing inducted HOF's
# 4. Create a new data frame with playerID, induction year, and birthCountry
# 5. Create a cumuluative function, filling out any null values to make line continuous
# 6. Change color palette
# 7. Visualize the graph using SeaBorn with year on the x axis, y as a logarthmic scale of cumulative HOF's through the years, and use country as a grouping factor
Stanford Project CSV Files (Filter only needed files)
DataFrameavailable as
df19
variable
SELECT * FROM 'HallOfFame.csv';
import pandas as pd
master = pd.read_csv('Master.csv')
master
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the 'HallOfFame.csv' file into a dataframe
hall_of_fame_df = pd.read_csv('HallOfFame.csv')
# Create a new dataframe with only inducted players
inducted_hof_df = hall_of_fame_df[hall_of_fame_df['inducted'] == 'Y']
# Load the Master.csv file into a DataFrame
df_master = pd.read_csv('Master.csv')
# Select only playerID and birthCountry from df_master
df_selected = df_master[['playerID', 'birthCountry']]
# Merge df_selected and inducted_hof_df by "playerID"
merged_df = df_selected.merge(inducted_hof_df, on="playerID")
# Create a new dataframe with "playerID", "birthCountry", and "yearid" columns from merged_df
new_df = merged_df[["playerID", "birthCountry", "yearid"]]
# Count the occurrences of each combination of 'birthCountry' and 'yearid'
group_count = new_df.groupby(['birthCountry', 'yearid']).size().reset_index(name='Count')
# Sort the DataFrame by 'yearid' and 'birthCountry'
sorted_df = group_count.sort_values(by=['yearid', 'birthCountry'])
# Fill missing combinations with 0 before calculating the cumulative sum
all_combinations = pd.MultiIndex.from_product([sorted_df['birthCountry'].unique(), sorted_df['yearid'].unique()], names=['birthCountry', 'yearid'])
sorted_df_filled = sorted_df.set_index(['birthCountry', 'yearid']).reindex(all_combinations, fill_value=0).reset_index()
# Sort the filled DataFrame
sorted_df_filled.sort_values(by=['yearid', 'birthCountry'], inplace=True)
# Calculate the cumulative sum for each 'birthCountry' and 'yearid'
sorted_df_filled['Cumulative_Sum'] = sorted_df_filled.groupby('birthCountry')['Count'].cumsum()
# Plot the cumulative sum with straight lines
plt.figure(figsize=(12, 6))
# Define custom colors for each birthCountry
country_palette = {"CAN": "#add8e6",
"Cuba": "#3498db",
"D.R.": "#90ee90",
"Germany": "#008000",
"Netherlands": "#ffa500",
"P.R.": "#e74c3c",
"Panama": "#ffff00",
"United Kingdom": "#ff4500",
"USA": "#b19cd9",
"Venezuela": "#6a5acd"
}
# Graph
sns.lineplot(data=sorted_df_filled, x='yearid', y='Cumulative_Sum', hue='birthCountry', marker='', drawstyle='steps-post', linewidth=2, palette=country_palette)
plt.yscale('log') # Set the y-axis to logarithmic scale
plt.title('Hall of Fame by Country Through The Years (Logarithmic Scale)')
plt.xlabel('Year')
plt.ylabel('Cumulative HOF (Log Scale)')
# Plot the data with the specified palette
plt.legend(title='Country', bbox_to_anchor=(1, 1))
plt.show()
Hidden code