Skip to content
Spotify Music Data
Spotify Music Data
This dataset consists of ~600 songs that were in the top songs of the year from 2010 to 2019 (as measured by Billboard). You can explore interesting song data pulled from Spotify such as the beats per minute, amount of spoken words, loudness, and energy of every song.
Not sure where to begin? Scroll to the bottom to find challenges!
import pandas as pd

# Load the dataset; the first CSV column is used as the row index.
spotify = pd.read_csv("spotify_top_music.csv", index_col=0)
print(spotify.shape)  # (number of rows, number of columns)
spotify.head(100)  # notebook preview; a bare expression shows nothing when run as a script

# Number of songs per genre label, most frequent first.
genre_counts = spotify['top genre'].value_counts()
genre_counts.head(5)  # notebook preview of the five most frequent genres
import matplotlib.pyplot as plt

# Bar chart of the five most frequent genres.
genre_counts.head(5).plot(kind='bar')
plt.title("Top 5 Genres from 2010 to 2019")
plt.xlabel("Genre")
plt.ylabel("Count")
plt.show()

# Number of songs per artist, most frequent first.
artist_counts = spotify['artist'].value_counts()
artist_counts.head(5)  # notebook preview of the five most frequent artists

import matplotlib.pyplot as plt

# Bar chart of the five artists with the most songs in the dataset.
artist_counts.head(5).plot(kind='bar')
plt.title("Top 5 Artist from 2010 to 2019")
plt.xlabel("Artist")
plt.ylabel("Count")
plt.show()

# Mean popularity score of the songs in each year.
annual_pop = spotify.groupby('year')['pop'].mean()
import matplotlib.pyplot as plt

# Line chart of mean popularity per year.
annual_pop.plot(marker='o')
plt.title("Average Track Popularity by Year")
plt.xlabel("Year")
plt.ylabel("Average Popularity")
plt.xticks(annual_pop.index)  # one tick per year present in the data
plt.grid(alpha=0.3)
plt.show()

# Import used by the plotting code that follows.
import matplotlib.pyplot as plt
# Yearly means of tempo (bpm), danceability (dnce) and energy (nrgy).
features = ['bpm', 'dnce', 'nrgy']
annual_feats = spotify.groupby('year')[features].mean()

# One line per feature, all sharing a single y-axis.
annual_feats.plot(marker='o')
plt.title("Mean Audio Features by Year")
plt.xlabel("Year")
plt.ylabel("Feature Value")
plt.xticks(annual_feats.index)  # one tick per year present in the data
plt.legend(title="Feature")
plt.grid(alpha=0.3)
plt.show()

# Import used by the plotting code that follows.
import matplotlib.pyplot as plt
# Yearly means of liveness (live) and acousticness (acous).
features = ['live', 'acous']
annual_feats = spotify.groupby('year')[features].mean()

# One line per feature, all sharing a single y-axis.
annual_feats.plot(marker='o')
plt.title("Mean Audio Features by Year")
plt.xlabel("Year")
plt.ylabel("Feature Value")
plt.xticks(annual_feats.index)  # one tick per year present in the data
plt.legend(title="Feature")
plt.grid(alpha=0.3)
plt.show()

# Import used by the plotting code that follows.
import matplotlib.pyplot as plt
# Yearly means of valence (val) and speechiness (spch).
features = ['val', 'spch']
annual_feats = spotify.groupby('year')[features].mean()

# One line per feature, all sharing a single y-axis.
annual_feats.plot(marker='o')
plt.title("Mean Audio Features by Year")
plt.xlabel("Year")
plt.ylabel("Feature Value")
plt.xticks(annual_feats.index)  # one tick per year present in the data
plt.legend(title="Feature")
plt.grid(alpha=0.3)
plt.show()

# Import used by the plotting code that follows.
import matplotlib.pyplot as plt
# Derive a duration column and total it per year.
# NOTE(review): dividing by 1000 assumes 'dur' is stored in milliseconds. The
# data dictionary only says "The duration of the song" and gives no unit.
# If 'dur' is already in seconds, this scaling (and the chart title) is wrong.
# TODO: confirm against the data.
spotify['dur_s'] = spotify['dur'] / 1000
annual_duration = spotify.groupby('year')['dur_s'].sum()

# Bar chart of the summed duration of all songs in each year.
annual_duration.plot(kind='bar')
plt.title("Total Duration of Top Tracks by Year (seconds)")
plt.xlabel("Year")
plt.ylabel("Total Seconds")
plt.show()

# Import used by the modelling code that follows.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Reload the data; this discards the 'dur_s' column added to `spotify` earlier.
# index_col=0 matches how the file is loaded at the top of the notebook, so the
# frame has the same layout in both places.
spotify = pd.read_csv('spotify_top_music.csv', index_col=0)

# Select the features by name rather than by position. A positional slice
# (iloc[:, 3:14]) silently shifts by one column depending on whether the CSV's
# first column is parsed as the index, which can pull the 'top genre' target
# into X and drop 'pop'. These are the year..pop columns (positions 3-13) of
# the data dictionary.
FEATURE_COLUMNS = [
    'year', 'bpm', 'nrgy', 'dnce', 'dB', 'live',
    'val', 'dur', 'acous', 'spch', 'pop',
]
X = spotify[FEATURE_COLUMNS]
y = spotify['top genre']  # target: genre label of each song

# Hold out 20% of the songs for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 reports 0 (without a warning per class) for genres that are
# never predicted; this matches the per-genre report built later in the file.
report = classification_report(y_test, y_pred, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

TOP_N = 3                    # how many genres to list and plot
labels = sorted(y.unique())  # every genre in the full dataset, sorted by name
metric = 'f1-score'          # report column used for ranking

# Per-genre metrics as a DataFrame with one row per label. Genres with nothing
# to score get 0 instead of a warning (zero_division=0).
report_df = pd.DataFrame(
    classification_report(
        y_test, y_pred,
        labels=labels,
        target_names=labels,
        output_dict=True,
        zero_division=0,
    )
).T

# Keep only the genre rows (dropping the averaged summary rows), rank by the
# chosen metric and take the best TOP_N.
# NOTE(review): the ranking ignores 'support', so a genre with very few test
# samples can rank first. Consider showing support alongside these scores.
top_df = (
    report_df.loc[labels, [metric, 'precision', 'recall']]
    .sort_values(metric, ascending=False)
    .head(TOP_N)
    .round(3)
)

print(f"Top {TOP_N} genres by {metric}:")
display(top_df)  # display() is provided by the IPython/Jupyter environment

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(top_df.index, top_df[metric])
ax.set_title(f"Top {TOP_N} Genres by {metric.capitalize()}")
ax.set_ylabel(metric.capitalize())
ax.set_xlabel("Genre")
# Rotate the labels the bar chart already created. set_xticklabels() is meant
# to be used only after fixing tick positions with set_xticks().
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Data dictionary
| | Variable | Explanation |
|---|---|---|
| 0 | title | The title of the song |
| 1 | artist | The artist of the song |
| 2 | top genre | The genre of the song |
| 3 | year | The year the song appeared on the Billboard chart |
| 4 | bpm | Beats per minute: the tempo of the song |
| 5 | nrgy | The energy of the song: higher values mean more energetic (fast, loud) |
| 6 | dnce | The danceability of the song: higher values mean it's easier to dance to |
| 7 | dB | Decibel: the loudness of the song |
| 8 | live | Liveness: likeliness the song was recorded with a live audience |
| 9 | val | Valence: higher values mean a more positive sound (happy, cheerful) |
| 10 | dur | The duration of the song |
| 11 | acous | The acousticness of the song: likeliness the song is acoustic |
| 12 | spch | Speechiness: higher values mean more spoken words |
| 13 | pop | Popularity: higher values mean more popular |
Source of dataset.