import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
Importing the data set
# Load the Spotify 2023 dataset. The file is decoded as latin-1 instead of
# pandas' default UTF-8.
df = pd.read_csv("Input/spotify-2023.csv", encoding='latin-1')
# Notebook display: first five rows.
df.head()
track_name | artist(s)_name | artist_count | released_year | released_month | released_day | in_spotify_playlists | in_spotify_charts | streams | in_apple_playlists | ... | bpm | key | mode | danceability_% | valence_% | energy_% | acousticness_% | instrumentalness_% | liveness_% | speechiness_% | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Seven (feat. Latto) (Explicit Ver.) | Latto, Jung Kook | 2 | 2023 | 7 | 14 | 553 | 147 | 141381703 | 43 | ... | 125 | B | Major | 80 | 89 | 83 | 31 | 0 | 8 | 4 |
1 | LALA | Myke Towers | 1 | 2023 | 3 | 23 | 1474 | 48 | 133716286 | 48 | ... | 92 | C# | Major | 71 | 61 | 74 | 7 | 0 | 10 | 4 |
2 | vampire | Olivia Rodrigo | 1 | 2023 | 6 | 30 | 1397 | 113 | 140003974 | 94 | ... | 138 | F | Major | 51 | 32 | 53 | 17 | 0 | 31 | 6 |
3 | Cruel Summer | Taylor Swift | 1 | 2019 | 8 | 23 | 7858 | 100 | 800840817 | 116 | ... | 170 | A | Major | 55 | 58 | 72 | 11 | 0 | 11 | 15 |
4 | WHERE SHE GOES | Bad Bunny | 1 | 2023 | 5 | 18 | 3133 | 50 | 303236322 | 84 | ... | 144 | A | Minor | 65 | 23 | 80 | 14 | 63 | 11 | 6 |
5 rows × 24 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 track_name 953 non-null object
1 artist(s)_name 953 non-null object
2 artist_count 953 non-null int64
3 released_year 953 non-null int64
4 released_month 953 non-null int64
5 released_day 953 non-null int64
6 in_spotify_playlists 953 non-null int64
7 in_spotify_charts 953 non-null int64
8 streams 953 non-null object
9 in_apple_playlists 953 non-null int64
10 in_apple_charts 953 non-null int64
11 in_deezer_playlists 953 non-null object
12 in_deezer_charts 953 non-null int64
13 in_shazam_charts 903 non-null object
14 bpm 953 non-null int64
15 key 858 non-null object
16 mode 953 non-null object
17 danceability_% 953 non-null int64
18 valence_% 953 non-null int64
19 energy_% 953 non-null int64
20 acousticness_% 953 non-null int64
21 instrumentalness_% 953 non-null int64
22 liveness_% 953 non-null int64
23 speechiness_% 953 non-null int64
dtypes: int64(17), object(7)
memory usage: 178.8+ KB
# Number of missing values in each column.
df.isna().sum()
track_name 0
artist(s)_name 0
artist_count 0
released_year 0
released_month 0
released_day 0
in_spotify_playlists 0
in_spotify_charts 0
streams 0
in_apple_playlists 0
in_apple_charts 0
in_deezer_playlists 0
in_deezer_charts 0
in_shazam_charts 50
bpm 0
key 95
mode 0
danceability_% 0
valence_% 0
energy_% 0
acousticness_% 0
instrumentalness_% 0
liveness_% 0
speechiness_% 0
dtype: int64
df.shape
(953, 24)
df.describe()
artist_count | released_year | released_month | released_day | in_spotify_playlists | in_spotify_charts | in_apple_playlists | in_apple_charts | in_deezer_charts | bpm | danceability_% | valence_% | energy_% | acousticness_% | instrumentalness_% | liveness_% | speechiness_% | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.00000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 |
mean | 1.556139 | 2018.238195 | 6.033578 | 13.930745 | 5200.124869 | 12.009444 | 67.812172 | 51.908709 | 2.666317 | 122.540399 | 66.96957 | 51.431270 | 64.279119 | 27.057712 | 1.581322 | 18.213012 | 10.131165 |
std | 0.893044 | 11.116218 | 3.566435 | 9.201949 | 7897.608990 | 19.575992 | 86.441493 | 50.630241 | 6.035599 | 28.057802 | 14.63061 | 23.480632 | 16.550526 | 25.996077 | 8.409800 | 13.711223 | 9.912888 |
min | 1.000000 | 1930.000000 | 1.000000 | 1.000000 | 31.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 65.000000 | 23.00000 | 4.000000 | 9.000000 | 0.000000 | 0.000000 | 3.000000 | 2.000000 |
25% | 1.000000 | 2020.000000 | 3.000000 | 6.000000 | 875.000000 | 0.000000 | 13.000000 | 7.000000 | 0.000000 | 100.000000 | 57.00000 | 32.000000 | 53.000000 | 6.000000 | 0.000000 | 10.000000 | 4.000000 |
50% | 1.000000 | 2022.000000 | 6.000000 | 13.000000 | 2224.000000 | 3.000000 | 34.000000 | 38.000000 | 0.000000 | 121.000000 | 69.00000 | 51.000000 | 66.000000 | 18.000000 | 0.000000 | 12.000000 | 6.000000 |
75% | 2.000000 | 2022.000000 | 9.000000 | 22.000000 | 5542.000000 | 16.000000 | 88.000000 | 87.000000 | 2.000000 | 140.000000 | 78.00000 | 70.000000 | 77.000000 | 43.000000 | 0.000000 | 24.000000 | 11.000000 |
max | 8.000000 | 2023.000000 | 12.000000 | 31.000000 | 52898.000000 | 147.000000 | 672.000000 | 275.000000 | 58.000000 | 206.000000 | 96.00000 | 97.000000 | 97.000000 | 97.000000 | 91.000000 | 97.000000 | 64.000000 |
# Annotated correlation heatmap. `numeric_only=True` restricts the
# correlation matrix to the numeric columns of `df`.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.corr(numeric_only=True), annot=True, linewidth=.5, ax=ax)
plt.show()
Converting column types
def _to_number(column):
    """Parse a text column as numbers, tolerating thousands separators.

    Commas are removed from string values before parsing; non-string values
    are passed through unchanged. Anything that still cannot be parsed
    becomes NaN (``errors='coerce'``).
    """
    cleaned = column.map(
        lambda value: value.replace(',', '') if isinstance(value, str) else value
    )
    return pd.to_numeric(cleaned, errors='coerce')


# These three columns are loaded with object dtype (see the df.info() output).
# NOTE(review): commas are stripped first on the assumption that the
# non-numeric text is thousands-separated counts such as "1,021"; without
# that, such values would be coerced to NaN and later replaced by 0.
# Confirm against the raw CSV.
df['streams'] = _to_number(df['streams'])
df['in_deezer_playlists'] = _to_number(df['in_deezer_playlists'])
df['in_shazam_charts'] = _to_number(df['in_shazam_charts'])
Handling missing values
# Column-specific defaults: a missing musical key becomes the label
# 'Unknown', a missing Shazam chart value becomes 0.
df['key'] = df['key'].fillna('Unknown')
df['in_shazam_charts'] = df['in_shazam_charts'].fillna(0)

# Any NaN still left in any column is replaced by 0.
df.fillna(0, inplace=True)

# Replace +/- infinity by 0 so every numeric column holds finite values.
df.replace([np.inf, -np.inf], 0, inplace=True)
Dataset for the songs released in 2023
# Keep only the tracks whose release year is 2023.
year_2023 = df[df['released_year'] == 2023]
# Notebook display: first five rows of the 2023 subset.
year_2023.head()
track_name | artist(s)_name | artist_count | released_year | released_month | released_day | in_spotify_playlists | in_spotify_charts | streams | in_apple_playlists | ... | bpm | key | mode | danceability_% | valence_% | energy_% | acousticness_% | instrumentalness_% | liveness_% | speechiness_% | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Seven (feat. Latto) (Explicit Ver.) | Latto, Jung Kook | 2 | 2023 | 7 | 14 | 553 | 147 | 141381703.0 | 43 | ... | 125 | B | Major | 80 | 89 | 83 | 31 | 0 | 8 | 4 |
1 | LALA | Myke Towers | 1 | 2023 | 3 | 23 | 1474 | 48 | 133716286.0 | 48 | ... | 92 | C# | Major | 71 | 61 | 74 | 7 | 0 | 10 | 4 |
2 | vampire | Olivia Rodrigo | 1 | 2023 | 6 | 30 | 1397 | 113 | 140003974.0 | 94 | ... | 138 | F | Major | 51 | 32 | 53 | 17 | 0 | 31 | 6 |
4 | WHERE SHE GOES | Bad Bunny | 1 | 2023 | 5 | 18 | 3133 | 50 | 303236322.0 | 84 | ... | 144 | A | Minor | 65 | 23 | 80 | 14 | 63 | 11 | 6 |
5 | Sprinter | Dave, Central Cee | 2 | 2023 | 6 | 1 | 2186 | 91 | 183706234.0 | 67 | ... | 141 | C# | Major | 92 | 66 | 58 | 19 | 0 | 8 | 24 |
5 rows × 24 columns
Top 5 songs and their artists
# Five most-streamed tracks: sort by stream count (descending) and keep the
# first five rows (the default of head()).
top_songs_and_artists = (
    df[['track_name', 'artist(s)_name', 'streams']]
    .sort_values(by='streams', ascending=False)
    .head()
)
# Notebook display of the resulting table.
top_songs_and_artists
track_name | artist(s)_name | streams | |
---|---|---|---|
55 | Blinding Lights | The Weeknd | 3.703895e+09 |
179 | Shape of You | Ed Sheeran | 3.562544e+09 |
86 | Someone You Loved | Lewis Capaldi | 2.887242e+09 |
620 | Dance Monkey | Tones and I | 2.864792e+09 |
41 | Sunflower - Spider-Man: Into the Spider-Verse | Post Malone, Swae Lee | 2.808097e+09 |
Creating Plot
# Set the style (set_theme is the preferred name of the older sns.set alias).
sns.set_theme(style="whitegrid")

# Horizontal bar chart: one bar per track, bar length = number of streams.
fig, ax = plt.subplots(figsize=(10, 6))
bars = sns.barplot(
    x='streams',
    y='track_name',
    hue='track_name',
    data=top_songs_and_artists,
    palette="viridis",
    edgecolor='black',
)

# Add annotations: write the comma-formatted stream count next to the end of
# each bar, vertically centred on the bar and shifted 5 points to the right.
for bar in bars.patches:
    plt.annotate(
        format(bar.get_width(), ','),
        (bar.get_width(), bar.get_y() + bar.get_height() / 2),
        ha='center',
        va='center',
        xytext=(5, 0),
        textcoords='offset points',
    )

# Set titles and labels
ax.set_title("Top 5 Songs and The Artists", fontsize=16, weight='bold')
ax.set_xlabel("Number of Streams", fontsize=14)
ax.set_ylabel("Tracks' Names", fontsize=14)

# Remove the left and bottom spines (top and right are removed by default).
sns.despine(left=True, bottom=True)

# Show the plot
plt.show()
Creating Interactive Plot
import plotly.express as px

# Create the plot: interactive bar chart of the top five tracks. Stream
# counts drive the bar length, the bar label and the colour scale.
fig = px.bar(
    top_songs_and_artists,
    x='streams',
    y='track_name',
    text='streams',
    color='streams',
    color_continuous_scale='viridis',
    title="Top 5 Songs and The Artists",
)

# Update the layout: axis titles, title font, and no grid lines.
fig.update_layout(
    xaxis_title="Number of Streams",
    yaxis_title="Tracks' Names",
    title_font_size=22,
    title_font_family="Arial",
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
)

# Update the traces: comma-formatted labels placed outside the bars.
fig.update_traces(texttemplate='%{text:,}', textposition='outside')

# Show the plot
fig.show()
Number of songs over the years on Spotify
# Number of tracks per release year (Series indexed by released_year).
year_song = df.groupby('released_year')['track_name'].count()
# Notebook display of the per-year counts.
year_song
released_year
1930 1
1942 1
1946 1
1950 1
1952 1
1957 2
1958 3
1959 2
1963 3
1968 1
1970 2
1971 1
1973 1
1975 2
1979 1
1982 2
1983 1
1984 4
1985 2
1986 2
1987 1
1991 2
1992 1
1994 1
1995 2
1996 1
1997 1
1998 1
1999 5
2000 4
2002 6
2003 2
2004 4
2005 1
2007 1
2008 2
2010 7
2011 10
2012 10
2013 13
2014 13
2015 11
2016 18
2017 23
2018 10
2019 36
2020 37
2021 119
2022 402
2023 175
Name: track_name, dtype: int64
# Set the style (set_theme is the preferred name of the older sns.set alias).
sns.set_theme(style="whitegrid")

# First plot: number of songs per release year, across all years.
fig, ax1 = plt.subplots(figsize=(12, 6))
ax1.plot(year_song.index, year_song.values, marker='o', linestyle='-',
         color='green', label='All years')

ax1.set_xlabel("Released Years", fontsize=14)
ax1.set_ylabel("Number of Tracks", fontsize=14)
ax1.set_title("Released Songs Over Years on Spotify", fontsize=16, weight='bold')

ax1.legend()
ax1.grid(True)

# Show the plot
plt.show()
# Restrict to releases from 2018 through 2023 (both inclusive) and count the
# tracks per year.
year1 = df[(df['released_year'] >= 2018) & (df['released_year'] <= 2023)]
year2 = year1.groupby('released_year')['track_name'].count()
# Notebook display of the per-year counts.
year2
released_year
2018 10
2019 36
2020 37
2021 119
2022 402
2023 175
Name: track_name, dtype: int64
# Second plot: number of songs per release year, 2018-2023 only.
fig, ax2 = plt.subplots(figsize=(12, 6))
ax2.plot(year2.index, year2.values, marker='o', linestyle='-',
         color='green', label='2018-2023')

ax2.set_xlabel("Released Years", fontsize=14)
ax2.set_ylabel("Number of Tracks", fontsize=14)
ax2.set_title("Released Songs Over Past 6 Years on Spotify", fontsize=16, weight='bold')

ax2.legend()
ax2.grid(True)

# Show the plot
plt.show()
Interactive plots
# First plot: number of songs over all years.
# reset_index() turns the per-year Series into a two-column frame whose
# 'track_name' column holds the counts.
fig1 = px.line(
    year_song.reset_index(),
    x='released_year',
    y='track_name',
    title='Released Songs Over years on Spotify',
    labels={'released_year': 'Released Years', 'track_name': 'Number of Tracks'},
)
fig1.update_traces(mode='lines+markers', line_color="green")
fig1.update_layout(title_font_size=22, title_font_family="Arial")

# Second plot: number of songs over the past 6 years.
fig2 = px.line(
    year2.reset_index(),
    x='released_year',
    y='track_name',
    title='Released Songs Over the past 6 years on Spotify',
    labels={'released_year': 'Released Years', 'track_name': 'Number of Tracks'},
)
fig2.update_traces(mode='lines+markers', line_color="green")
fig2.update_layout(title_font_size=22, title_font_family="Arial")

# Show the plots
fig1.show()
fig2.show()
Playlist vs streams
# Create subplots: two side-by-side panels sharing the streams (y) axis.
fig, axs = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

# One scatter panel per platform: (playlist-count column, platform, colour).
panels = [
    ('in_spotify_playlists', 'Spotify', 'blue'),
    ('in_apple_playlists', 'Apple', 'green'),
]
for panel, (column, platform, colour) in zip(axs, panels):
    panel.scatter(df[column], df['streams'], color=colour, alpha=0.5)
    panel.set_xlabel(f'Number of {platform} Playlists')
    panel.set_ylabel('Streams')
    panel.set_title(f'{platform} Playlists vs Streams')
    panel.grid(True)

# Set a common title
fig.suptitle('Number of Playlists vs Streams (Spotify vs Apple Music)', fontsize=16)
plt.show()
Analyzing features
# Select the columns for analysis: the percentage-valued audio features.
features = ['danceability_%', 'valence_%', 'energy_%', 'acousticness_%',
            'instrumentalness_%', 'liveness_%', 'speechiness_%']

# Pairwise scatter plots of the features, with KDE curves on the diagonal.
sns.pairplot(df[features], diag_kind='kde', height=1.75)
# y=1.02 places the title slightly above the top of the figure.
plt.suptitle('Pairwise Relationships between Audio Feature', y=1.02)
plt.show()
Creating radar Chart for 1st song
from math import pi
def create_radar_chart(df, row, title, feature_cols=None):
    """Draw a radar (polar) chart of one row's audio-feature values.

    The chart is drawn on a polar subplot of the current matplotlib figure;
    the caller creates the figure and calls ``plt.show()``.

    Args:
        df: DataFrame containing the feature columns.
        row: Index label of the row to plot (looked up with ``.loc``).
        title: Text shown above the chart.
        feature_cols: Optional list of column names to plot. When omitted,
            the module-level ``features`` list is used.
    """
    columns = features if feature_cols is None else list(feature_cols)
    categories = list(df[columns].columns)

    # Repeat the first value at the end so the outline closes on itself.
    values = df[columns].loc[row].values.flatten().tolist()
    values += values[:1]

    # One evenly spaced angle per category, closed the same way as `values`.
    angles = [n / float(len(categories)) * 2 * pi for n in range(len(categories))]
    angles += angles[:1]

    ax = plt.subplot(111, polar=True)
    # The duplicated closing angle gets no tick label.
    plt.xticks(angles[:-1], categories, color='grey', size=8)
    ax.plot(angles, values, linewidth=1, linestyle='solid')
    ax.fill(angles, values, 'b', alpha=0.1)
    plt.title(title, size=11, color='b', y=1.1)


# Radar chart for the first track, titled with that track's name.
plt.figure(figsize=(6, 6))
create_radar_chart(df, 0, df['track_name'].iloc[0])
plt.show()
Creating clusters
from sklearn.cluster import KMeans

# Fit k-means with 5 clusters on the audio-feature columns. random_state is
# fixed and n_init is given explicitly (10 initialisations).
kmeans = KMeans(n_clusters=5, random_state=0, n_init=10).fit(df[features])
# Store each track's cluster label in a new column.
df['cluster'] = kmeans.labels_

# Plot clusters: danceability against energy, coloured by cluster label.
plt.figure(figsize=(12, 8))
sns.scatterplot(x='danceability_%', y='energy_%', hue='cluster',
                palette='viridis', data=df, alpha=0.7)
plt.title('Cluster Analysis of Audio Features')
plt.xlabel('Danceability (%)')
plt.ylabel('Energy (%)')
plt.show()
Trends of the features over time
audio_features = ['danceability_%', 'valence_%', 'energy_%', 'acousticness_%',
                  'instrumentalness_%', 'liveness_%', 'speechiness_%']
# Mean of every audio feature per release year.
trends = df.groupby('released_year')[audio_features].mean().reset_index()

# Plotting trends over time: one stacked subplot per feature, shared x axis.
fig, ax = plt.subplots(len(audio_features), 1, figsize=(14, 20), sharex=True)
for i, feature in enumerate(audio_features):
    sns.lineplot(x='released_year', y=feature, data=trends, ax=ax[i])
    # The "_%" suffix is dropped for the title and shown as " (%)" on the axis.
    ax[i].set_title(f'Trends of {feature.replace("_%","")} over Years')
    ax[i].set_ylabel(feature.replace("_%", " (%)"))

# plt.xlabel acts on the current axes, i.e. the last subplot created.
plt.xlabel('Released Year')

plt.tight_layout()
plt.show()
Heatmap for frequency of the chart appearance
# Chart-appearance columns for the four platforms.
heatmap_data = df[['in_spotify_charts', 'in_apple_charts',
                   'in_deezer_charts', 'in_shazam_charts']]

# Annotated correlation heatmap, values shown with two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(heatmap_data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Chart Appearances')
plt.show()
Max Difference between danceability and energy
# Mean danceability and energy per release year.
yearly_data = df.groupby('released_year').agg(
    {'danceability_%': 'mean', 'energy_%': 'mean'}
).reset_index()

fig, ax = plt.subplots(figsize=(10, 6))

# Plot danceability and energy as lines.
ax.plot(yearly_data['released_year'], yearly_data['danceability_%'],
        label='Danceability', color='blue')
ax.plot(yearly_data['released_year'], yearly_data['energy_%'],
        label='Energy', color='salmon')

# Highlight the maximum difference: the row with the largest absolute gap
# between the two yearly means.
yearly_data['difference'] = abs(yearly_data['danceability_%'] - yearly_data['energy_%'])
max_diff_year = yearly_data.loc[yearly_data['difference'].idxmax()]

# The arrow points at the danceability value of that year. The label is
# placed in figure-fraction coordinates at (0.65, 0.25); (0.05, 0.05) would
# be the bottom-left corner.
ax.annotate(
    f"Max diff: {max_diff_year['difference']:.2f}%",
    xy=(max_diff_year['released_year'], max_diff_year['danceability_%']),
    xytext=(0.65, 0.25),
    textcoords='figure fraction',
    arrowprops={'arrowstyle': "->", 'color': 'gray'},
    ha='left',
)

ax.set_xlabel('Year')
ax.set_ylabel('Percentage')
ax.set_title('Average Danceability and Energy by Year')

ax.legend()
plt.show()
# Scatter Plot with Trend Line: Number of Artists vs Released Year
def scatter_plot_with_trendline(df, x_col, y_col, title, xlabel, ylabel):
    """Show a scatter plot of two columns with a fitted regression line.

    Creates a new 10x6 figure, draws the points and the regression line with
    ``sns.regplot`` (line in red), and displays the figure.

    Args:
        df: DataFrame holding the data.
        x_col: Name of the column plotted on the x axis.
        y_col: Name of the column plotted on the y axis.
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
    """
    plt.figure(figsize=(10, 6))
    sns.regplot(x=df[x_col], y=df[y_col],
                scatter_kws={'alpha': 0.5, 's': 10},
                line_kws={'color': 'red'})

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(True)

    plt.show()


scatter_plot_with_trendline(df, 'released_year', 'artist_count',
                            'Number of Artist vs Released Year',
                            'Release Year', 'Number of Artists')
Max collaborations
# Add a new column to indicate if the track is a collaboration (more than one artist)
df['is_collaboration'] = df['artist_count'] > 1

# Filter the DataFrame to the years shown in the plot (both bounds inclusive).
# The bounds are named so the plot title always matches the filter.
first_year, last_year = 1995, 2023
filtered_df = df[(df['released_year'] >= first_year) & (df['released_year'] <= last_year)]

# Group by released year: the sum of the boolean flag is the number of
# collaborations, the count is the total number of tracks, and the
# difference is the number of solo tracks.
yearly_collaborations = filtered_df.groupby('released_year')['is_collaboration'].sum().reset_index()
yearly_solo_tracks = filtered_df.groupby('released_year')['is_collaboration'].count().reset_index()
yearly_solo_tracks['is_collaboration'] -= yearly_collaborations['is_collaboration']

# Combine the data into a single DataFrame for plotting
yearly_data = pd.DataFrame({
    'Year': yearly_collaborations['released_year'],
    'Collaborations': yearly_collaborations['is_collaboration'],
    'Solo Tracks': yearly_solo_tracks['is_collaboration'],
})

# Plot: stacked areas, solo tracks at the bottom, collaborations on top.
plt.figure(figsize=(12, 6))
plt.stackplot(yearly_data['Year'], yearly_data['Solo Tracks'], yearly_data['Collaborations'],
              labels=['Solo Tracks', 'Collaborations'], colors=['skyblue', 'salmon'])
plt.xlabel('Release Year')
plt.ylabel('Number of Tracks')
plt.title(f'Number of Solo Tracks and Collaborations by Released Year({first_year}-{last_year})')
plt.legend(loc='upper left')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Highlight the year with the most collaborations
max_collab_year = yearly_data.loc[yearly_data['Collaborations'].idxmax()]

# NOTE(review): the arrow targets y = number of collaborations, while the
# collaborations band is stacked on top of the solo band; confirm whether
# the arrow should point at the top of the stack instead.
plt.annotate(
    f"Max collaborations: {max_collab_year['Collaborations']}",
    xy=(max_collab_year['Year'], max_collab_year['Collaborations']),
    xytext=(max_collab_year['Year'], max_collab_year['Collaborations'] + 240),
    arrowprops=dict(facecolor='black', arrowstyle='->'),
    ha='center',
)
plt.show()
df.head()
track_name | artist(s)_name | artist_count | released_year | released_month | released_day | in_spotify_playlists | in_spotify_charts | streams | in_apple_playlists | ... | mode | danceability_% | valence_% | energy_% | acousticness_% | instrumentalness_% | liveness_% | speechiness_% | cluster | is_collaboration | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Seven (feat. Latto) (Explicit Ver.) | Latto, Jung Kook | 2 | 2023 | 7 | 14 | 553 | 147 | 141381703.0 | 43 | ... | Major | 80 | 89 | 83 | 31 | 0 | 8 | 4 | 3 | True |
1 | LALA | Myke Towers | 1 | 2023 | 3 | 23 | 1474 | 48 | 133716286.0 | 48 | ... | Major | 71 | 61 | 74 | 7 | 0 | 10 | 4 | 3 | False |
2 | vampire | Olivia Rodrigo | 1 | 2023 | 6 | 30 | 1397 | 113 | 140003974.0 | 94 | ... | Major | 51 | 32 | 53 | 17 | 0 | 31 | 6 | 1 | False |
3 | Cruel Summer | Taylor Swift | 1 | 2019 | 8 | 23 | 7858 | 100 | 800840817.0 | 116 | ... | Major | 55 | 58 | 72 | 11 | 0 | 11 | 15 | 3 | False |
4 | WHERE SHE GOES | Bad Bunny | 1 | 2023 | 5 | 18 | 3133 | 50 | 303236322.0 | 84 | ... | Minor | 65 | 23 | 80 | 14 | 63 | 11 | 6 | 1 | False |
5 rows × 26 columns
Streams by Music Mode
# Apply the theme before the figure is created so the figure itself is
# built with the themed settings.
sns.set_theme(style="whitegrid")
plt.figure(figsize=(10, 6))

# Create violin plot with customizations: one violin per mode, quartile
# lines drawn inside; the legend is turned off (legend=False) because hue
# repeats the x variable.
sns.violinplot(data=df, x="mode", y="streams", hue="mode",
               palette="Set3", inner="quartile", legend=False)

# Adding titles and labels
plt.title('Streams by Music Mode')
plt.xlabel('Mode')
plt.ylabel('Streams')

# Adjusting layout
plt.tight_layout()

# Show plot
plt.show()