Skip to content
1 hidden cell
Video Games Sales Data
Video Games Sales Data
This dataset contains records of popular video games in North America, Japan, Europe and other parts of the world. Every video game in this dataset has at least 100k global sales.
Not sure where to begin? Scroll to the bottom to find challenges!
Source of dataset.
import pandas as pd
sales = pd.read_csv("vgsales.csv", index_col=0)
print(sales.shape)
sales.head(100)
Data Dictionary
Column | Explanation |
---|---|
Rank | Ranking of overall sales |
Name | Name of the game |
Platform | Platform of the games release (i.e. PC,PS4, etc.) |
Year | Year the game was released in |
Genre | Genre of the game |
Publisher | Publisher of the game |
NA_Sales | Number of sales in North America (in millions) |
EU_Sales | Number of sales in Europe (in millions) |
JP_Sales | Number of sales in Japan (in millions) |
Other_Sales | Number of sales in other parts of the world (in millions) |
Global_Sales | Number of total sales (in millions) |
1 hidden cell
# Group the sales dataframe by Genre and sum the Global_Sales for each genre
genre_sales = sales.groupby('Genre')['Global_Sales'].sum().reset_index()
# Sort the genres by Global_Sales in descending order to find the most popular genres
most_popular_genres = genre_sales.sort_values(by='Global_Sales', ascending=False)
# Display the most popular genres
most_popular_genres
import pandas as pd
# Assuming 'sales' is a DataFrame that needs to be defined
# Here is an example of how you might define it. Replace this with your actual data loading code.
# sales = pd.read_csv('path_to_your_sales_data.csv')
# Group the sales dataframe by Platform and sum the Global_Sales for each platform
platform_sales = sales.groupby('Platform')['Global_Sales'].sum().reset_index()
# Sort the platforms by Global_Sales in descending order to find the platforms that sold the most games
most_popular_platforms = platform_sales.sort_values(by='Global_Sales', ascending=False).head(10)
# Display the top 10 platforms that sold the most games
most_popular_platforms
# Group the sales dataframe by Name and sum the Global_Sales for each game
game_sales = sales.groupby(['Name', 'Genre'])['Global_Sales'].sum().reset_index()
# Sort the games by Global_Sales in descending order to find the most sold games
top_10_most_sold_games = game_sales.sort_values(by='Global_Sales', ascending=False).head(10)
# Display the top 10 most sold games along with their genre
top_10_most_sold_games
# Group the sales dataframe by Publisher and sum the Global_Sales for each publisher
franchise_sales = sales.groupby(['Publisher'])['Global_Sales'].sum().reset_index()
# Filter the dataframe to include only the franchises of interest
franchises_of_interest = ['Mario', 'Grand Theft Auto', 'Call of Duty', 'Pokemon', 'Zelda', 'Tetris', 'Assassin\'s Creed', 'Sonic', 'FIFA']
filtered_franchise_sales = sales[sales['Name'].str.contains('|'.join(franchises_of_interest), case=False)]
# Group by franchise name to get the total sales for each franchise
filtered_franchise_sales['Franchise'] = filtered_franchise_sales['Name'].apply(
lambda x: next((franchise for franchise in franchises_of_interest if franchise.lower() in x.lower()), None)
)
total_franchise_sales = filtered_franchise_sales.groupby('Franchise')['Global_Sales'].sum().reset_index()
# Sort the franchises by Global_Sales in descending order
top_franchises = total_franchise_sales.sort_values(by='Global_Sales', ascending=False)
# Display the top franchises
top_franchises
import matplotlib.pyplot as plt
import seaborn as sns
# Get the top 5 genres by global sales
top_5_genres = most_popular_genres.head(5)['Genre']
# Filter the sales dataframe to include only the top 5 genres
top_5_genres_sales = sales[sales['Genre'].isin(top_5_genres)]
# Group by Genre and calculate the average sales for NA, EU, JP, Other, and Global
average_sales_top_5_genres = top_5_genres_sales.groupby('Genre')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].mean().reset_index()
# Melt the dataframe for easier plotting with seaborn
melted_average_sales = average_sales_top_5_genres.melt(id_vars='Genre', var_name='Region', value_name='Average_Sales')
# Create the plot
plt.figure(figsize=(12, 6))
sns.barplot(data=melted_average_sales, x='Genre', y='Average_Sales', hue='Region')
plt.title('Average Sales for Top 5 Genres by Region')
plt.xlabel('Genre')
plt.ylabel('Average Sales (in millions)')
plt.legend(title='Region')
plt.show()
# Filter the sales dataframe to include only the last 10 years
current_year = sales['Year'].max()
last_10_years_sales = sales[sales['Year'] >= (current_year - 10)]
# Group the filtered dataframe by Publisher and sum the Global_Sales for each publisher
publisher_sales_last_10_years = last_10_years_sales.groupby('Publisher')['Global_Sales'].sum().reset_index()
# Sort the publishers by Global_Sales in descending order to rank the publishers
ranked_publishers_last_10_years = publisher_sales_last_10_years.sort_values(by='Global_Sales', ascending=False)
# Find the main game for each publisher
main_games = last_10_years_sales.loc[last_10_years_sales.groupby('Publisher')['Global_Sales'].idxmax()]
# Merge the ranked publishers with their main games
ranked_publishers_with_main_game = ranked_publishers_last_10_years.merge(main_games[['Publisher', 'Name', 'Global_Sales']], on='Publisher', suffixes=('_total', '_main_game'))
# Select the top 10 publishers
top_10_publishers_with_main_game = ranked_publishers_with_main_game.head(10)
# Display the top 10 ranked publishers with their main games
top_10_publishers_with_main_game