Skip to content
Course Notes: Machine Learning for Time Series Data in Python
Machine Learning for Time Series Data
Soundwave Data
import librosa as lr
from glob import glob
# List all the wav files in the folder.
# glob() returns matches in arbitrary (filesystem-dependent) order, so sort
# them to make "the first audio file" the same file on every run.
audio_files = sorted(glob(data_dir + '/*.wav'))
# Read in the first audio file: lr.load gives back the samples and the
# sampling frequency they were loaded at
audio, sfreq = lr.load(audio_files[0])
# Sample index divided by samples-per-second is the time stamp in seconds
time = np.arange(0, len(audio)) / sfreq
# Plot audio over time
fig, ax = plt.subplots()
ax.plot(time, audio)
ax.set(xlabel='Time (s)', ylabel='Sound Amplitude')
plt.show()
# One shared-axis 3x2 grid: six panels in total
fig, axs = plt.subplots(3, 2, figsize=(15, 7), sharex=True, sharey=True)
# Calculate the time array (axis 0 of `normal` is treated as the sample axis)
time = np.arange(normal.shape[0]) / sfreq
# Stack the normal/abnormal audio side by side, then transpose so that
# iterating yields one recording per step
# NOTE(review): assumes normal and abnormal each hold 3 recordings as columns,
# matching the 3x2 grid -- confirm against the data
stacked_audio = np.hstack([normal, abnormal]).T
# Loop through each audio file / ax object and plot
# .T.ravel() transposes the axes grid, then unravels it into a 1-D vector so
# the panels are visited column by column
for iaudio, ax in zip(stacked_audio, axs.T.ravel()):
    ax.plot(time, iaudio)
# Helper defined outside this snippet; called once after all panels are drawn
show_plot_and_make_titles()
# Collapse each group along axis 1 so a single averaged trace remains per group
mean_normal = np.mean(normal, axis=1)
mean_abnormal = np.mean(abnormal, axis=1)
# Draw the two averaged traces side by side on a shared y-axis
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 3), sharey=True)
panels = (
    (ax1, mean_normal, "Normal Data"),
    (ax2, mean_abnormal, "Abnormal Data"),
)
for panel, trace, title in panels:
    panel.plot(time, trace)
    panel.set(title=title)
plt.show()
Rolling Mean
from sklearn.model_selection import cross_val_score

# Rectification: replace every sample by its absolute value
audio_rectified = audio.abs()
# Smoothing: average over a sliding window of 50 samples
audio_rectified_smooth = audio_rectified.rolling(50).mean()
# Summary statistics along axis 0 of the smoothed envelope.
# The numpy functions are used on purpose: np.std uses ddof=0.
means, stds, maxs = (
    stat(audio_rectified_smooth, axis=0) for stat in (np.mean, np.std, np.max)
)
# Feature matrix (one column per statistic) and target column vector
X = np.column_stack([means, stds, maxs])
y = labels.reshape(-1, 1)
# Score the model with cv=5 cross-validation and report the mean score
percent_score = cross_val_score(model, X, y, cv=5)
print(np.mean(percent_score))
Tempo
# Calculate the tempo of the sounds, one entry per column of `audio`.
# aggregate=None is passed so the tempo estimates are not collapsed into a
# single number, which is what makes the statistics below meaningful.
tempos = []
for _, i_audio in audio.items():
    tempos.append(lr.beat.tempo(i_audio.values, sr=sfreq, hop_length=2**6, aggregate=None))
# Convert the list to an array so you can manipulate it more easily
tempos = np.array(tempos)
# Calculate statistics of each tempo series (reduce over the last axis)
tempos_mean = tempos.mean(axis=-1)
tempos_std = tempos.std(axis=-1)
tempos_max = tempos.max(axis=-1)
# Create the X and y arrays: envelope statistics plus the tempo statistics
X = np.column_stack([means, stds, maxs, tempos_mean, tempos_std, tempos_max])
y = labels.reshape(-1, 1)
# Score the model with cv=5 cross-validation and report the mean score
percent_score = cross_val_score(model, X, y, cv=5)
print(np.mean(percent_score))
Spectrograms using STFT
import numpy as np
# Import the stft function and the helpers used to display its output
from librosa.core import stft
from librosa.core import amplitude_to_db
from librosa.display import specshow
# Prepare the STFT
HOP_LENGTH = 2**4
spec = stft(audio, hop_length=HOP_LENGTH, n_fft=2**7)
# Convert into decibels.
# stft returns complex values; take the magnitude explicitly so that
# amplitude_to_db does not have to discard the phase itself (it warns when
# handed complex input).
spec_db = amplitude_to_db(np.abs(spec))
# Compare the raw audio to the spectrogram of the audio
fig, axs = plt.subplots(2, 1, figsize=(10, 10), sharex=True)
axs[0].plot(time, audio)
specshow(spec_db, sr=sfreq, x_axis='time', y_axis='hz', hop_length=HOP_LENGTH, ax=axs[1])
plt.show()
import librosa as lr
import numpy as np
from librosa.core import amplitude_to_db
from librosa.display import specshow
# The spectral features need a real-valued magnitude spectrogram. Taking the
# absolute value converts a complex STFT and leaves an existing magnitude
# spectrogram unchanged.
spec_mag = np.abs(spec)
# Calculate the spectral centroid and bandwidth for the spectrogram
# ([0] drops the leading singleton axis of the returned array)
bandwidths = lr.feature.spectral_bandwidth(S=spec_mag)[0]
centroids = lr.feature.spectral_centroid(S=spec_mag)[0]
# Convert spectrogram to decibels for visualization
spec_db = amplitude_to_db(spec_mag)
# Display these features on top of the spectrogram
# NOTE(review): times_spec is not defined in this snippet -- presumably the
# time stamp of each spectrogram frame; confirm where it is created
fig, ax = plt.subplots(figsize=(10, 5))
specshow(spec_db, x_axis='time', y_axis='hz', hop_length=HOP_LENGTH, ax=ax)
ax.plot(times_spec, centroids)
# Shade a band one bandwidth wide, centred on the centroid
ax.fill_between(times_spec, centroids - bandwidths / 2, centroids + bandwidths / 2, alpha=.5)
ax.set(ylim=[None, 6000])
plt.show()
# Loop through each spectrogram, reducing it to two scalar features
# NOTE(review): assumes each entry of `spectrograms` is a magnitude
# spectrogram suitable for the S= argument -- confirm where it is built
bandwidths = []
centroids = []
for spec in spectrograms:
    # Calculate the mean spectral bandwidth
    this_mean_bandwidth = np.mean(lr.feature.spectral_bandwidth(S=spec))
    # Calculate the mean spectral centroid
    this_mean_centroid = np.mean(lr.feature.spectral_centroid(S=spec))
    # Collect the values
    bandwidths.append(this_mean_bandwidth)
    centroids.append(this_mean_centroid)
# Create X and y arrays.
# The tempo statistics are referenced by the names they are assigned under
# earlier in these notes (tempos_mean / tempos_max / tempos_std).
X = np.column_stack([means, stds, maxs, tempos_mean, tempos_max, tempos_std, bandwidths, centroids])
y = labels.reshape(-1, 1)
# Score the model with cv=5 cross-validation and report the mean score
percent_score = cross_val_score(model, X, y, cv=5)
print(np.mean(percent_score))
Stocks Data
# Read in the data, using the first CSV column as the index
data = pd.read_csv('prices.csv', index_col=0)
# Convert the index of the DataFrame to datetime
data.index = pd.to_datetime(data.index)
print(data.head())
# Loop through each column, plot its values over time on one shared axis
fig, ax = plt.subplots()
for column in data.columns:
    data[column].plot(ax=ax, label=column)
# Build the legend once, after every line has been added
ax.legend()
plt.show()
Cleaning Missing Data