Spotify Music Data

This dataset consists of ~600 songs that ranked among the top songs of each year from 2010 to 2019 (as measured by Billboard). You can explore interesting song data pulled from Spotify, such as the beats per minute, amount of spoken words, loudness, and energy of every song.

Not sure where to begin? Scroll to the bottom to find challenges!

import pandas as pd
# Load the track table; the first CSV column is used as the row index rather
# than being kept as a data column.
spotify = pd.read_csv("spotify_top_music.csv", index_col=0)
# Print (rows, columns) as a quick sanity check on the load.
print(spotify.shape)
# Preview the first 100 rows. This bare expression is only rendered when it is
# the last statement of a notebook cell; in a plain script it has no visible effect.
spotify.head(100)

import matplotlib.pyplot as plt

# Number of tracks per genre, most frequent genre first.
genre_counts = spotify['top genre'].value_counts()
genre_counts.head(5)

# Bar chart of the five most frequent genres, labelled through the Axes
# object returned by pandas.
genre_ax = genre_counts.head(5).plot.bar()
genre_ax.set_title("Top 5 Genres from 2010 to 2019")
genre_ax.set_xlabel("Genre")
genre_ax.set_ylabel("Count")
plt.show()

# Number of tracks per artist, most frequent artist first.
artist_counts = spotify['artist'].value_counts()
artist_counts.head(5)

import matplotlib.pyplot as plt

# Bar chart of the five artists with the most tracks in the dataset.
artist_counts.head(5).plot(kind='bar')
# Title uses the plural "Artists" to match the chart's content (five artists).
plt.title("Top 5 Artists from 2010 to 2019")
plt.xlabel("Artist")
plt.ylabel("Count")
plt.show()

import matplotlib.pyplot as plt

# Mean of the 'pop' column within each year.
annual_pop = spotify.groupby('year')['pop'].mean()

# Line chart with one marker per year; all styling goes through the Axes
# object returned by pandas.
pop_ax = annual_pop.plot(marker='o')
pop_ax.set_title("Average Track Popularity by Year")
pop_ax.set_xlabel("Year")
pop_ax.set_ylabel("Average Popularity")
# Put a tick on every year present in the data.
pop_ax.set_xticks(annual_pop.index)
pop_ax.grid(alpha=0.3)
plt.show()

import matplotlib.pyplot as plt


def _plot_annual_feature_means(data, columns):
    """Plot the per-year mean of each column in *columns* and return the table.

    Args:
        data: DataFrame containing a 'year' column and every name in *columns*.
        columns: List of column names to average within each year.

    Returns:
        DataFrame indexed by year with one mean-value column per requested
        feature (the same table that is drawn).
    """
    yearly_means = data.groupby('year')[columns].mean()

    # One line per feature, with a marker on every year.
    ax = yearly_means.plot(marker='o')
    ax.set_title("Mean Audio Features by Year")
    ax.set_xlabel("Year")
    ax.set_ylabel("Feature Value")
    # Put a tick on every year present in the data.
    ax.set_xticks(yearly_means.index)
    ax.legend(title="Feature")
    ax.grid(alpha=0.3)
    plt.show()
    return yearly_means


# Each inner list is drawn as its own figure.
FEATURE_GROUPS = [
    ['bpm', 'dnce', 'nrgy'],
    ['live', 'acous'],
    ['val', 'spch'],
]

# `features` and `annual_feats` stay module-level names; after the loop they
# hold the last group and its yearly means.
for features in FEATURE_GROUPS:
    annual_feats = _plot_annual_feature_means(spotify, features)

import matplotlib.pyplot as plt

# Add a derived duration column to the shared frame.
# NOTE(review): dividing by 1000 presumes 'dur' is stored in milliseconds; the
# data dictionary gives no unit for it — confirm before trusting the
# "seconds" labels below.
spotify['dur_s'] = spotify['dur'].div(1000)

# Sum of the derived duration over all tracks in each year.
annual_duration = spotify.groupby('year')['dur_s'].sum()

duration_ax = annual_duration.plot.bar()
duration_ax.set_title("Total Duration of Top Tracks by Year (seconds)")
duration_ax.set_xlabel("Year")
duration_ax.set_ylabel("Total Seconds")
plt.show()

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Reload the table with the first CSV column as the row index, the same way
# the exploratory load does, so column positions and names agree with the
# data dictionary.
spotify = pd.read_csv('spotify_top_music.csv', index_col=0)

# Pick predictors by name rather than by position. A positional slice shifts
# silently depending on whether the leading index column is parsed as data,
# which can pull the 'top genre' label itself into the feature matrix.
FEATURE_COLUMNS = [
    'year', 'bpm', 'nrgy', 'dnce', 'dB', 'live',
    'val', 'dur', 'acous', 'spch', 'pop',
]
X = spotify[FEATURE_COLUMNS]
y = spotify['top genre']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 scores genres that were never predicted as 0 without
# emitting a warning per genre.
report = classification_report(y_test, y_pred, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)


import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# How many of the best-scoring genres to show.
TOP_N = 3

# Every genre in the full label column, so the report has one row per genre
# even when a genre is absent from the test split.
labels = sorted(y.unique())
# Column of the classification report used for ranking.
metric = 'f1-score'

# output_dict=True returns nested dicts keyed by genre; transposing puts one
# genre per row and one metric per column.
report_df = pd.DataFrame(
    classification_report(
        y_test, y_pred,
        labels=labels,
        target_names=labels,
        output_dict=True,
        zero_division=0,
    )
).T

# Keep only per-genre rows (dropping the aggregate rows of the report), rank
# by the chosen metric and keep the best TOP_N.
top_df = (
    report_df.loc[labels, [metric, 'precision', 'recall']]
    .sort_values(metric, ascending=False)
    .head(TOP_N)
    .round(3)
)

print(f"Top {TOP_N} genres by {metric}:")
# `display` is not imported in this file; presumably it is the builtin that
# the notebook environment provides.
display(top_df)

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(top_df.index, top_df[metric])
ax.set_title(f"Top {TOP_N} Genres by {metric.capitalize()}")
ax.set_ylabel(metric.capitalize())
ax.set_xlabel("Genre")
# Restyle the tick labels that bar() already created. Calling
# set_xticklabels() without fixing the tick positions first is discouraged by
# matplotlib and triggers a warning.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

Data dictionary

	Variable	Explanation
0	title	The title of the song
1	artist	The artist of the song
2	top genre	The genre of the song
3	year	The year the song appeared on the Billboard chart
4	bpm	Beats per minute: the tempo of the song
5	nrgy	The energy of the song: higher values mean more energetic (fast, loud)
6	dnce	The danceability of the song: higher values mean it's easier to dance to
7	dB	Decibel: the loudness of the song
8	live	Liveness: likeliness the song was recorded with a live audience
9	val	Valence: higher values mean a more positive sound (happy, cheerful)
10	dur	The duration of the song
11	acous	The acousticness of the song: likeliness the song is acoustic
12	spch	Speechiness: higher values mean more spoken words
13	pop	Popularity: higher values mean more popular

Source of dataset.

Spotify Music Data

Spotify Music Data

Data dictionary

Spotify Music Data