In [1]:
import os
import glob
import numpy as np

from matplotlib import pyplot as plt

import scipy.io.wavfile as wav
from scipy.fftpack import fft
from scipy import signal

import librosa
import IPython.display as ipd

import matplotlib.pyplot as plt
from librosa.display import specshow, waveplot

from tensorflow import set_random_seed

np.random.seed(42)
set_random_seed(42)

%matplotlib inline
C:\python36\envs\machine_learning\lib\site-packages\h5py\__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters

Exploration

Audio length

In [2]:
# Measure the duration of every recording, plot the distribution of durations,
# then draw the waveform of the last recording that was loaded.

# glob returns paths in an unspecified order; sorting makes "the last
# recording" (the one plotted below) the same on every run.
recording_paths = sorted(glob.glob("free-spoken-digit-dataset-master/recordings/*.wav"))
if not recording_paths:
    # Without any file, `x` and `sr` would never be defined and the waveform
    # plot below would fail with an unrelated NameError.
    raise FileNotFoundError(
        "No .wav file found in free-spoken-digit-dataset-master/recordings/"
    )

cnt = []
for audio_path in recording_paths:
    # sr=None keeps the native sampling rate of the file. The default would
    # resample every clip to 22050 Hz, which is slow and does not change the
    # duration we are measuring.
    x, sr = librosa.load(audio_path, sr=None)
    x = librosa.to_mono(x)

    cnt.append(len(x) / sr)  # duration in seconds

plt.figure(figsize=(20,8))
plt.hist(cnt, bins=81)
plt.title("Duration of sounds", fontsize=20)
plt.xlabel("Time(s)", fontsize=15)
plt.ylabel("Number of audio", fontsize=15)
plt.show()

# File names follow "<digit>_<speaker>_<index>.wav"; read the digit from the
# file actually plotted instead of hard-coding it in the title.
digit = os.path.basename(audio_path).split("_")[0]

plt.figure(figsize=(20,8))
librosa.display.waveplot(x, sr=sr)
plt.title("Audio wave of a \"{}\"".format(digit), fontsize=20)
# waveplot draws the x axis in seconds, not in sample indices.
plt.xlabel("Time (s)", fontsize=15)
plt.ylabel("Amplitude", fontsize=15)
plt.show()

Scipy vs Librosa

In [3]:
# Compare the spectrogram of one reference recording as computed by
# matplotlib, scipy and librosa, side by side.
baseline_audio = "free-spoken-digit-dataset-master/recordings/0_jackson_0.wav"

N_FFT = 256       # FFT size shared by the three implementations
HOP_LENGTH = 128  # STFT hop in samples (16 ms if the recording is sampled at 8 kHz)

sample_rate, samples = wav.read(baseline_audio)

# Keep only the first channel when the file is not mono.
if len(samples.shape) > 1:
    samples = samples[:,0]

# Same window array for the matplotlib and librosa panels.
tukey_window = signal.tukey(N_FFT)

plt.figure(figsize=(25,12))

# --- matplotlib ---------------------------------------------------------
plt.subplot(1, 3, 1)
PxN, freqsN, binsN, imN = plt.specgram(samples,
                                       NFFT = N_FFT,
                                       Fs=1,
                                       Fc=0,
                                       window = tukey_window,
                                       pad_to = None,
                                       noverlap = 1
                                      )
plt.ylabel("Freq")
plt.xlabel("Frame")
plt.title("Matplotlib", fontsize=15)

# --- scipy --------------------------------------------------------------
frequencies, times, spectogram = signal.spectrogram(samples,
                                                    fs=1,
                                                    nfft=N_FFT,
                                                    window=('tukey', 0.25),
                                                    nperseg = sample_rate//50,
                                                    noverlap = 1,
                                                    scaling = "density",
                                                    mode = "magnitude"
                                                   )

plt.subplot(1, 3, 2)
# - A tiny offset keeps log() finite where the magnitude is exactly zero
#   (an -inf value would break the colour normalisation).
# - origin='lower' puts row 0 (lowest frequency bin) at the bottom, which is
#   what the ascending `extent` announces; the default origin='upper' would
#   draw the spectrogram upside down relative to its tick labels.
plt.imshow(np.log(spectogram + np.finfo(float).eps),
           extent=[0,spectogram.shape[1],0,spectogram.shape[0]],
           aspect='auto',
           origin='lower')
plt.ylabel("Freq")
plt.xlabel("Frame")
plt.title("Scipy", fontsize=15)

# --- librosa ------------------------------------------------------------
y, sr = librosa.load(baseline_audio, sr=None)
y = librosa.to_mono(y)

# np.abs(D[f, t]) is the magnitude of frequency bin f at frame t
# np.angle(D[f, t]) is the phase of frequency bin f at frame t
D = librosa.stft(y,
                 n_fft = N_FFT,
                 hop_length = HOP_LENGTH,
                 win_length = N_FFT,
                 window = tukey_window,
                )

plt.subplot(1, 3, 3)
Xdb = librosa.amplitude_to_db(abs(D))
# specshow must be given the hop length used by the STFT; otherwise it assumes
# its own default (512) and the time axis is stretched by a factor of 4.
librosa.display.specshow(Xdb, sr=sample_rate, hop_length=HOP_LENGTH, x_axis='time', y_axis='hz')
plt.ylabel("Freq")
# With x_axis='time' the axis is expressed in seconds, not in frames.
plt.xlabel("Time (s)")
plt.title("Librosa", fontsize=15)

plt.show()

#######################################

# S = np.abs(D)
# comps, acts = librosa.decompose.decompose(S, n_components=8)
        
# plt.figure(figsize=(20,6))

# plt.subplot(1, 2, 1)
# librosa.display.specshow(comps, y_axis='log', x_axis='time')
# plt.colorbar(format='%+2.0f dB')

# plt.subplot(1, 2, 2)
# librosa.display.specshow(acts, x_axis='time')
# plt.colorbar(format='%+2.0f dB')

# plt.show()

# #######################################

# H, P = librosa.decompose.hpss(D, margin=3.0)
# R = D - (H + P)
        
# plt.figure(figsize=(20,20))

# plt.subplot(3, 1, 1)
# specshow(librosa.amplitude_to_db(D,ref=np.max),y_axis='log', x_axis='time')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Full power spectrogram')

# plt.subplot(3, 1, 2)
# specshow(librosa.amplitude_to_db(H,ref=np.max),y_axis='log')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Harmonic power spectrogram')

# plt.subplot(3, 1, 3)
# specshow(librosa.amplitude_to_db(P,ref=np.max),y_axis='log')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Percussive power spectrogram')

# plt.tight_layout()
# plt.show()

####################################

We can experiment more with different parameter values in SciPy, so this library seems more suitable going forward.

Decomposition with Scipy

From the durations we saw previously, only a few sounds are longer than 1 s. Let's compute the FFT of every sound, after first extending it to 1 s (or cutting it at 1 s).

In [4]:
# Directory where one spectrogram matrix (.npy) per recording is written and
# later read back; shared metadata goes into its "meta" sub-directory.
spectrogram_dir = "datas/spectrograms"
In [5]:
# Compute a magnitude spectrogram for every recording, after forcing each clip
# to exactly one second of audio, and store one .npy matrix per recording.

TARGET_SR = 8000  # sampling rate every clip is brought to; 1 s == TARGET_SR samples

# np.save does not create missing directories.
os.makedirs(os.path.join(spectrogram_dir, "meta"), exist_ok=True)

for i, audio_path in enumerate(glob.glob("free-spoken-digit-dataset-master/recordings/*.wav")):
    filename = os.path.splitext(os.path.basename(audio_path))[0]

    y, sr = librosa.load(audio_path, sr=None)
    y = librosa.to_mono(y)

    if sr != TARGET_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
        # Keep `sr` in sync with the resampled signal: the buffer length and
        # nperseg below are derived from it.
        sr = TARGET_SR

    # Zero-pad short clips and truncate long ones to exactly one second.
    # Always copying into the same float64 buffer also keeps the dtype of the
    # saved spectrograms identical for short and long clips.
    new_sound = np.zeros((sr, ))
    n_kept = min(len(y), sr)
    new_sound[:n_kept] = y[:n_kept]

    # fs=1: `frequencies` is in cycles/sample and `times` in samples.
    frequencies, times, spectogram = signal.spectrogram(new_sound,
                                                        fs=1,
                                                        nfft=256,
                                                        window=('tukey', 0.25),
                                                        nperseg = sr//40,
                                                        noverlap = 2,
                                                        scaling = "density",
                                                        mode = "magnitude")
    if i == 0:
        # The axes are identical for every clip, so they are stored only once.
        np.save(os.path.join(spectrogram_dir,"meta", "frequency.npy"), frequencies)
        np.save(os.path.join(spectrogram_dir,"meta", "times.npy"), times)

    np.save(os.path.join(spectrogram_dir, filename + ".npy"), spectogram)
In [6]:
# Build the dataset from the saved spectrograms: one flattened, log-scaled
# spectrogram per row of X, with the spoken digit (y) and the speaker name
# (user) parsed from the file name "<digit>_<speaker>_<index>".
label = []
speaker_names = []
flat_spectrograms = []

# glob returns paths in an unspecified order; sorting makes the row order of
# X / y / user identical on every run.
for matrix_path in sorted(glob.glob(spectrogram_dir + "/*.npy")):
    filename = os.path.splitext(os.path.basename(matrix_path))[0]
    spectrogram = np.load(matrix_path)
    # +1 keeps the logarithm finite (and >= 0) where the magnitude is zero.
    spectrogram = np.log10(spectrogram+1)
    parsed_filename = filename.split("_")
    label.append(int(parsed_filename[0]))
    speaker_names.append(parsed_filename[1])
    flat_spectrograms.append(spectrogram.ravel())

y = np.array(label)
user = np.array(speaker_names)
X = np.array(flat_spectrograms)

# np.save does not create missing directories.
os.makedirs("datas/scipy", exist_ok=True)
np.save("datas/scipy/y.npy", y)
np.save("datas/scipy/X.npy", X)
np.save("datas/scipy/user.npy", user)
In [7]:
# Reload the cached dataset from disk: labels, feature matrix, speaker names.
y, X, user = (
    np.load(array_path)
    for array_path in (
        "datas/scipy/y.npy",
        "datas/scipy/X.npy",
        "datas/scipy/user.npy",
    )
)
In [8]:
# Heat-map of the whole feature matrix: one row per recording, one column per
# flattened spectrogram coefficient.
plt.figure(figsize = (20, 12))
# origin='lower' draws row 0 at the bottom, matching the ascending `extent`;
# with the default origin the rows would be flipped relative to the tick labels.
plt.imshow(X, extent=[0, X.shape[1], 0, X.shape[0]], aspect='auto', origin='lower');
# X holds log10(magnitude + 1) values, not decibels, so the colour bar is
# labelled accordingly instead of using a "dB" tick format.
colorbar = plt.colorbar()
colorbar.set_label("log10(magnitude + 1)", fontsize=15)
plt.title("Matrix of Amplitude for all audio", fontsize=20)
plt.xlabel("Spectrogram flattened", fontsize=15)
plt.ylabel("Audio", fontsize=15)
plt.show()