In [1]:
import os
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
import librosa
import seaborn as sns

import matplotlib.pyplot as plt
from librosa.display import specshow

import IPython.display as ipd
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from scipy import signal
from scipy.stats import entropy
from numpy.linalg import norm
from scipy.spatial.distance import pdist
# NOTE(review): this `norm` (scipy's normal distribution) shadows
# `numpy.linalg.norm` imported two lines above — only the scipy one is
# reachable as `norm` after this point. Confirm which is intended; the
# vector norm is still available via `np.linalg.norm`.
from scipy.stats import norm
from umap import UMAP
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

%matplotlib inline
In [2]:
def JSD(P, Q):
    """Jensen-Shannon divergence between two histograms (natural log).

    Inputs are L1-normalized first, so un-normalized histograms are
    accepted. Result is symmetric and lies in [0, ln 2].
    """
    p = P / np.linalg.norm(P, ord=1)
    q = Q / np.linalg.norm(Q, ord=1)
    m = (p + q) / 2.0
    return (entropy(p, m) + entropy(q, m)) / 2.0

def jaccard(x, y):
    """Histogram-intersection distance: 1 minus the element-wise overlap.

    Equals the Jaccard-style distance when x and y are L1-normalized
    histograms (each summing to 1); 0 for identical histograms.
    """
    overlap = np.minimum(x, y).sum()
    return 1 - overlap


In the previous notebook, we tried to use a neural network directly on the full FFT. This gave bad results. Then we tried to slice the music into 0.5 s pieces, compute a covariance matrix for each piece, and feed these to an RNN. This gave better results, but accuracy was still low (45% with only 8 classes). Now we will explore another way to classify songs with a simpler model.

The first step is to extract a frequency histogram for each beat. These can be used the way SIFT descriptors are used in vision: we cluster those features and then build a Bag-of-Words representation of audio features for each sample. The result can then be processed the same way as text data.


As is well explained in this video, there is another way to extract music features: the Constant-Q Transform. This is what we will use, together with beat tracking, to slice every song. First, let's look at it on one example.

In [3]:
# Path to one sample track from the FMA-small dataset, relative to the notebook.
base_song_path = "fma_small/000/000005.mp3"
In [4]:
# Load the audio at its native rate, then standardize to 44.1 kHz so the
# frame/hop sizes below always mean the same wall-clock duration.
y, sr = librosa.load(base_song_path, sr=None, mono=True)

if sr != 44100:
    # Keyword arguments match librosa's current resample signature.
    y = librosa.resample(y, orig_sr=sr, target_sr=44100)
    sr = 44100

# STFT with non-overlapping 1024-sample Tukey windows.
# 1024 / 44100 Hz ~= 23 ms per frame (the original comment said 20 ms).
# BUG FIX: the original call was missing its closing parenthesis,
# which made this cell a SyntaxError.
D = librosa.stft(y,
                 n_fft=1024,
                 hop_length=1024,
                 win_length=1024,
                 window=signal.windows.tukey(1024))  # signal.tukey was removed in SciPy 1.8

# Beat tracking on the same hop grid as the STFT, so beat frame indices
# line up with STFT columns.
tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=1024)
beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=1024)

# Constant-Q magnitude, then split each beat interval into sub-segments
# (n_segments=1 keeps one boundary per beat) for finer slicing.
cqt = np.abs(librosa.cqt(y, sr=sr, hop_length=1024))
subseg = librosa.segment.subsegment(cqt, beats, n_segments=1)
subseg_t = librosa.frames_to_time(subseg, sr=sr, hop_length=1024)
In [5]:
# Overlay the detected beat positions on the dB-scaled STFT spectrogram.
specshow(librosa.amplitude_to_db(np.abs(D)), y_axis='hz', x_axis='time')
y_lo, y_hi = plt.gca().get_ylim()
plt.vlines(beat_times, y_lo, y_hi,
           color='lime', alpha=0.9, linewidth=2, label='Beats')
plt.title('FFT + Beat and sub-beat markers', fontsize=25)