Clustering Springsteen albums with spotifyr

The spotifyr package is great and lets you explore various aspects of music such as tempo, danceability and valence. In this post, we’ll explore Bruce Springsteen albums in terms of their similarities and dissimilarities.

# devtools::install_github('charlie86/spotifyr')

library(spotifyr)
library(tidyverse)
library(magrittr)
library(ggridges)
library(ggcorrplot)
library(viridisLite)
library(factoextra)
library(ggiraphExtra)

Getting the data is easy with the get_artist_audio_features() function. Here, we’ll load it from a csv file and have a look.

# To query Spotify directly instead of using the cached copy, run:
# df <- get_artist_audio_features(artist = "bruce springsteen")

# Read the cached track-level table from GitHub, then print a column overview.
df <- "https://raw.github.com/peerchristensen/Springsteen_album_clusters/master/springsteen_albums.csv" %>%
  read_csv()

glimpse(df)
## Observations: 537
## Variables: 31
## $ artist_name            <chr> "Bruce Springsteen", "Bruce Springsteen...
## $ artist_uri             <chr> "3eqjTLE0HfPfh78zjh6TqT", "3eqjTLE0HfPf...
## $ album_uri              <chr> "0PMasrHdpaoIRuHuhHp72O", "0PMasrHdpaoI...
## $ album_name             <chr> "Born In The U.S.A.", "Born In The U.S....
## $ album_img              <chr> "https://i.scdn.co/image/d002b63ceb5658...
## $ album_type             <chr> "album", "album", "album", "album", "al...
## $ is_collaboration       <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
## $ album_release_date     <chr> "1984-06-04", "1984-06-04", "1984-06-04...
## $ album_release_year     <date> 1984-06-04, 1984-06-04, 1984-06-04, 19...
## $ album_popularity       <dbl> 76, 76, 76, 76, 76, 76, 76, 76, 76, 76,...
## $ track_name             <chr> "Born in the U.S.A.", "Cover Me", "Darl...
## $ track_uri              <chr> "0dOg1ySSI7NkpAe89Zo0b9", "4U7NhC2rQTAh...
## $ track_number           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...
## $ disc_number            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ danceability           <dbl> 0.398, 0.535, 0.536, 0.429, 0.544, 0.62...
## $ energy                 <dbl> 0.952, 0.884, 0.982, 0.949, 0.762, 0.44...
## $ key                    <chr> "E", "A", "G", "C", "A#", "C#", "F", "A...
## $ loudness               <dbl> -6.042, -5.499, -4.674, -5.295, -7.289,...
## $ mode                   <chr> "major", "minor", "major", "major", "ma...
## $ speechiness            <dbl> 0.0610, 0.0407, 0.0389, 0.0458, 0.0382,...
## $ acousticness           <dbl> 0.000373, 0.001880, 0.014100, 0.084200,...
## $ instrumentalness       <dbl> 7.75e-05, 1.26e-03, 3.67e-05, 0.00e+00,...
## $ liveness               <dbl> 0.1000, 0.1400, 0.2740, 0.1540, 0.0740,...
## $ valence                <dbl> 0.584, 0.796, 0.963, 0.967, 0.473, 0.86...
## $ tempo                  <dbl> 122.093, 120.555, 119.201, 184.286, 120...
## $ duration_ms            <dbl> 278680, 205987, 288027, 192267, 215427,...
## $ time_signature         <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...
## $ key_mode               <chr> "E major", "A minor", "G major", "C maj...
## $ track_popularity       <dbl> 72, 51, 45, 47, 49, 71, 50, 47, 53, 62,...
## $ track_preview_url      <chr> "https://p.scdn.co/mp3-preview/3b6a5b91...
## $ track_open_spotify_url <chr> "https://open.spotify.com/track/0dOg1yS...

We’ll need to do just a little bit of cleaning and remove a few non-studio albums.

# Albums to drop: compilations, alternate versions and releases that only
# contribute a single song.
remove_albums <- c(
  "Greatest Hits",
  "Hammersmith Odeon, London 75",
  "The Essential Bruce Springsteen (Bonus Disc)",
  "The Ties That Bind: The River Collection",
  "Chapter and Verse",
  "The Promise",
  "Tracks"
)

# Drop the listed albums and anything with "live"/"Live" in its name, then
# tidy the album names. The mutate() steps run in order, each one seeing the
# result of the previous one:
#   1. title-case the name,
#   2. cut everything from the first colon onwards,
#   3-5. shorten three long titles so they fit on plot axes.
df <- df %>%
  filter(!(album_name %in% remove_albums | grepl("live|Live", album_name))) %>%
  mutate(
    album_name = str_to_title(album_name),
    album_name = gsub(":.*", "", album_name),
    album_name = replace(album_name, grepl("Innocent", album_name),
                         "The Wild, The Innocent.."),
    album_name = replace(album_name, grepl("Greetings", album_name),
                         "Greetings"),
    album_name = replace(album_name, grepl("Darkness", album_name),
                         "Darkness")
  )

Let’s first inspect the five most used keys in Springsteen songs.

# Bar chart of the five most frequent key/mode combinations.
#
# count(sort = TRUE) replaces the select/group_by/count/arrange/ungroup chain,
# and slice_max() replaces the superseded top_n(5), which picked its weighting
# column implicitly (and printed a "Selecting by n" message). Like top_n(),
# slice_max() keeps ties at the cut-off, so more than five bars are possible.
df %>%
  count(key_mode, sort = TRUE) %>%
  slice_max(n, n = 5) %>%
  # Remember the rank so the most common key ends up at the top after
  # coord_flip().
  mutate(ordered = row_number()) %>%

  ggplot(aes(x = reorder(key_mode, desc(ordered)), y = n, fill = n)) +
  geom_col() +
  coord_flip() +
  ggtitle("Five most common keys") +
  # guide = "none" hides the legend; guide = FALSE/F is deprecated in ggplot2.
  scale_fill_viridis_c(option = "B", direction = -1, guide = "none") +
  theme_minimal() +
  labs(y = "n", x = "key")

As we’ve seen, spotifyr grabs a lot of interesting data from the Spotify API. Let’s first look at the danceability of each album. ‘Born to Run’ appears to be the lowest in danceability, whereas ‘Tunnel of Love’ is highest.

# Ridgeline plot: one danceability density per album, ordered by release date.
#
# The group_by() that used to precede ggplot() had no effect on the plot and
# has been dropped; geom_density_ridges() groups by the discrete y aesthetic.
df %>%
  ggplot(aes(x    = danceability,
             y    = reorder(album_name, desc(album_release_year)),
             fill = reorder(album_name, desc(album_release_year)))) +
  geom_density_ridges(colour = "snow") +
  # guide = "none" hides the legend; guide = FALSE/F is deprecated in ggplot2.
  scale_fill_viridis_d(option = "B", begin = .05, direction = -1,
                       guide = "none") +
  theme_minimal() +
  ggtitle("Danceability") +
  labs(y = "album")

Let’s instead put all features in the same plot.

# Ridgeline plots of six audio features, one facet per feature and one ridge
# per album (ordered by release date).
df %>%
  # Long format: one row per track and feature. pivot_longer() replaces the
  # superseded gather().
  pivot_longer(cols = c(danceability, energy, loudness,
                        valence, tempo, acousticness),
               names_to = "feature", values_to = "measure") %>%

  ggplot(aes(x    = measure,
             y    = reorder(album_name, desc(album_release_year)),
             fill = album_release_date)) +
  # `legend` is not a geom_density_ridges() parameter and was ignored with a
  # warning; `show.legend` is the argument that suppresses the layer's legend.
  geom_density_ridges(rel_min_height = 0.005, show.legend = FALSE,
                      alpha = .9, size = .2, colour = "snow") +
  # Features live on very different scales, so free both axes per facet.
  facet_wrap(~feature, scales = "free", ncol = 2) +
  scale_fill_viridis_d(option = "B", begin = .05) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 7)) +
  labs(y = "album name") +
  ggtitle("Springsteen albums in six features",
          subtitle = "Acousticness, danceability, energy, loudness, tempo and valence") +
  # "none" replaces the deprecated guides(fill = FALSE).
  guides(fill = "none")

It will also be interesting to visualize how strongly the individual features are correlated. It is not too surprising that energy and loudness are positively correlated, whereas acousticness is negatively correlated with energy.

# p-values for every pairwise correlation between the six audio features;
# ggcorrplot() uses them to mark non-significant cells.
sign_test <- cor_pmat(
  select(df, acousticness, danceability, energy, loudness, tempo, valence)
)

# Lower-triangle correlation heatmap of the same six features.
ggcorrplot(
  cor(select(df, acousticness, danceability, energy, loudness, tempo, valence)),
  type   = "lower",
  p.mat  = sign_test,
  colors = c(inferno(5)[2], "snow", inferno(5)[4])
) +
  ggtitle("Correlations between features",
          subtitle = "Non-significant correlations marked with X")

Based on the features, we can also explore how similar the albums are in a distance matrix. In this plot, the orange colour indicates high dissimilarity or great “distance” between albums.

# Album-level feature profile: mean standardised value of each feature per
# album, returned as a plain data.frame with album names as row names.
#
# The features are standardised across ALL tracks *before* grouping. Calling
# scale() inside a grouped summarise() standardises within each album, and the
# mean of a within-group z-score is 0 (up to floating-point noise) for every
# album. All albums then look identical, and the downstream distance matrix,
# radar plots and clusterings are built on rounding error.
dfScale <- df %>%
  select(album_name, acousticness, danceability, energy, loudness, tempo, valence) %>%
  # scale() returns a one-column matrix; as.numeric() flattens it to a vector.
  mutate(across(-album_name, ~ as.numeric(scale(.x)))) %>%
  group_by(album_name) %>%
  # The grouping column is excluded from everything() inside across().
  summarise(across(everything(), mean)) %>%
  data.frame()

# Move album names into row names so the distance and clustering functions
# see a purely numeric table but still label rows by album.
row.names(dfScale) <- dfScale$album_name
dfScale$album_name <- NULL

# Pairwise distances between albums; stand = TRUE standardises the columns
# before the distances are computed.
df_dist <- dfScale %>%
  get_dist(stand = TRUE)

# Heatmap of the distance matrix with a dark-to-orange gradient.
df_dist %>%
  fviz_dist(gradient = list(low  = inferno(5)[2],
                            mid  = "white",
                            high = inferno(5)[4])) +
  theme_minimal() +
  labs(title    = "Distance matrix",
       subtitle = "Similarity between albums based on all features") +
  theme(axis.title  = element_blank(),
        axis.text.x = element_text(hjust = 1, angle = 45))

To get a clearer picture, we can explore patterns between albums and features with radar plots from the ggiraphExtra package.

# One radar chart per album showing its profile across the features in
# dfScale.
dfScale %>%
  # ggRadar() needs the grouping variable as a column, not as row names.
  mutate(albums = row.names(dfScale)) %>%
  ggRadar(aes(group = albums),
          rescale = FALSE, legend.position = "none",
          size = 1, interactive = FALSE, use.label = TRUE) +
  facet_wrap(~albums) +
  scale_y_discrete(breaks = NULL) +
  scale_fill_viridis_d(option = "B") +
  scale_colour_viridis_d(option = "B") +
  # A complete theme replaces every earlier theme() tweak, so theme_minimal()
  # must come first; previously it discarded the axis.text.x size setting.
  theme_minimal() +
  theme(axis.text.x     = element_text(size = 10),
        legend.position = "none")

As a final step, we’ll have a look at how albums may be grouped based on the various features using hierarchical and k-means clustering. We’ll first use the fviz_nbclust() function from the factoextra package to calculate the optimal number of clusters. Note that there are different methods included in the function to calculate the number of clusters. By default, the “silhouette” method is used.

# Silhouette-based suggestion for the number of hierarchical clusters.
fviz_nbclust(dfScale, hcut) +
  ggtitle("Optimal Number of Clusters: H-Clustering")

# Hierarchical clustering on Euclidean distances between the column-scaled
# album profiles (hclust() default linkage).
df.hc <- hclust(dist(scale(dfScale)))

# Dendrogram cut into three clusters. k_colors needs one colour per cluster:
# with only two colours for k = 3 the vector is recycled (with a warning) and
# two clusters end up with the same colour.
fviz_dend(df.hc, k = 3,
          cex = .9,
          k_colors = inferno(10)[c(4, 6, 8)],
          color_labels_by_k = TRUE,
          rect = TRUE) +
  ggtitle("Hierarchical Clustering")

# Silhouette-based suggestion for the number of k-means clusters.
# kmeans() starts from randomly chosen centres, so seed the search as well;
# otherwise this plot can differ from run to run.
set.seed(324789)
fviz_nbclust(dfScale, kmeans) +
  ggtitle("Optimal Number of Clusters: K-means Clustering")

# Re-seed immediately before the final fit so km.res does not depend on how
# many random numbers the search above consumed.
set.seed(324789)
km.res <- kmeans(dfScale, 2, nstart = 25)

# Cluster plot of the album profiles, coloured by k-means cluster.
fviz_cluster(km.res, data = dfScale,
             ellipse.type = "convex",
             repel = TRUE,  # TRUE rather than T, which can be reassigned
             palette = inferno(10)[c(4, 6, 8)],
             ggtheme = theme_minimal(),
             main = "K-means Clustering")

comments powered by Disqus