音乐速度与节拍估计（一）基本方法

SoYouTry

已于 2022-02-15 16:32:15 修改

阅读量2.8k

点赞数 1

分类专栏：操作步骤与实践音频音乐技术文章标签：音频编码解码 python 动态规划

于 2022-02-04 17:55:00 首次发布

本文链接：https://blog.csdn.net/SoYouTry/article/details/122784693

版权

音频音乐技术同时被 2 个专栏收录

6 篇文章

订阅专栏

操作步骤与实践

5 篇文章

订阅专栏

转载自我的个人网站 https://wzw21.cn/2022/02/04/tempo-baseline/

使用Librosa库对音乐速度、节拍进行估计的基本方法

参考：https://tempobeatdownbeat.github.io/tutorial/ch2_basics/baseline.html

基本设置

import librosa
import matplotlib.pyplot as plt
import librosa.display
import numpy as np

mount = False

from google.colab import drive
drive.mount('/content/drive')
# drive._mount('/content/drive') # failed on Jan 21st, 2022
mount = True

if mount == True:
  filename = "drive/MyDrive/data/tempo_tutorial/audio/book_assets_ch2_basics_audio_easy_example"
else:
  filename = "book_assets_ch2_basics_audio_easy_example"
sr = 44100
fps = 100
hop_length = int(librosa.time_to_samples(1./fps,sr=sr)) # 441
# this Calculation is new to me
n_fft = 2048
# length of fft window
fmin = 27.5
fmax = 17000.
n_mels = 80
# number of Mel bands to generate

y, sr = librosa.load(filename+".flac", sr=sr)

时频特征（Mel-Spectrogram）

# Mel-spectrogram
mel_spec = librosa.feature.melspectrogram(y, sr=sr, n_fft=n_fft,
                      hop_length=hop_length,
                      fmin=fmin, fmax=fmax,
                      n_mels=n_mels)
# melspectrogram's defult parameters
# fmax = sr / 2
# win_length = n_fft
# power = 2 # exponent for the magnitude melspectrogram. e.g., 1 for energy, 2 for power, etc.

# If a time-series input y, sr is provided, then its magnitude spectrogram S is first computed, and then mapped onto the mel scale by mel_f.dot(S**power).

fig, ax = plt.subplots(nrows=2, sharex=True, figsize=(14,6))
librosa.display.waveplot(y, sr=sr, alpha=0.6, ax=ax[0])
# alpha controls the color transparency
ax[0].set_title('Audio waveform', fontsize=15)
ax[0].label_outer()
# only show "outer" labels and tick labels

librosa.display.specshow(librosa.power_to_db(mel_spec, ref=np.max), # convert a power spectrogram (amplitude(?) squared) to decibel (dB) units
            y_axis='mel', x_axis='time', sr=sr,
            hop_length=hop_length, fmin=fmin, fmax=fmax,
            ax=ax[1])
ax[1].set_title('Mel Spectrogram', fontsize=15)
ax[1].label_outer()

在这里插入图片描述

中层特征（Spectral Flux）

# Mid-level Representation (Spectral Flux)

# Onset strength at time t is determined by:
# mean_f max(0, S[f, t] - ref[f, t - lag])
# where ref is S after local max filtering along the frequency axis.

S = mel_spec # pre-computed (log-power) spectrogram
lag = 2 # time lag for computing differences
max_size = 3 # size (in frequency bins) of the local max filter， set to 1 to disable filtering
spectral_flux = librosa.onset.onset_strength(S=librosa.power_to_db(S, ref=np.max),
                        sr=sr, hop_length=hop_length,
                        lag=lag, max_size=max_size)
# Compute a spectral flux onset strength envelope

times = librosa.frames_to_time(np.arange(len(spectral_flux)),
                 sr=sr, hop_length=hop_length)
# or librosa.times_like(spectral_flux, sr=sr, hop_length=hop_length)

plt.figure(figsize=(14, 3))
plt.plot(times, spectral_flux, label="Spectral flux")
plt.title("Spectral flux")
plt.legend() # show legend
plt.show()

在这里插入图片描述

速度估计（Autocorrelation）

# Periodicity Detection and Tempogram

fig, ax = plt.subplots(nrows=3, figsize=(14, 12))

tempogram = librosa.feature.tempogram(onset_envelope=spectral_flux,
                    sr=sr, hop_length=hop_length)
# Compute the tempogram: local autocorrelation of the onset strength envelope
# default win_length = 384: length of the onset autocorrelation window
# return_shape = (win_length, n)
# time lag changes from 0 to win_length (?) \
# when time_lag == win_length \
# there is no overlap between origin window and shifted window, thus autocorrelation should be 0 \
# but for global_ac (window is much larger) there are still a lot of overlops

librosa.display.specshow(tempogram, sr=sr, hop_length=hop_length,
             x_axis='time', y_axis='tempo', cmap='magma',
             ax=ax[0])
# y_axis='tempo' visualizes the outputs of feature.tempogram
# cmap: color map
# win_length => BPM (?)

tempo = librosa.beat.tempo(onset_envelope=spectral_flux, sr=sr,
               hop_length=hop_length)[0]
# default aggregation function: mean
# shape = (1,) or (n,) if no aggregate
ax[0].axhline(tempo, color='w', linestyle='--', alpha=1,
       label='Estimated tempo={:g}'.format(tempo))
# this line shows the estimated global tempo
ax[0].legend(loc='upper right')
ax[0].set_title('Fig.2: Tempogram',fontsize=15)

ac_global = librosa.autocorrelate(spectral_flux, max_size=tempogram.shape[0])
# Compute global onset autocorrelation
# max_size: maximum correlation lag
ac_global = librosa.util.normalize(ac_global)

x_scale = np.linspace(start=0, stop=tempogram.shape[0] * float(hop_length) / sr,
            num=tempogram.shape[0])
# return evenly spaced numbers over a specified interval
ax[1].plot(x_scale, np.mean(tempogram, axis=1), label='Mean local autocorrelation')
ax[1].plot(x_scale, ac_global, '--', label='Global autocorrelation')
ax[1].legend(loc='upper right')
ax[1].set(xlabel='Lag (seconds)')

# ax[2]: map the lag scale into tempo, which is a prior distribution
freqs = librosa.tempo_frequencies(n_bins = tempogram.shape[0],
                  hop_length=hop_length, sr=sr)
# Compute the frequencies (in beats per minute) corresponding to an onset auto-correlation or tempogram matrix
# n_bins: the number of lag bins
# freqs[0] = +np.inf corresponds to 0-lag
# freqs[1] = 6000, [2] = 3000, [3] = 1500 ...
# freqs here means x_scale

ax[2].semilogx(freqs[1:], np.mean(tempogram[1:], axis=1),
        label='Mean local autocorrelation', basex=2)
ax[2].semilogx(freqs[1:], ac_global[1:], linestyle='--',
        label='Global autocorrelation', basex=2)
ax[2].axvline(tempo, color='black', linestyle='--',
       label='Estimated tempo={:g}'.format(tempo))

ax[2].legend(loc='upper right')
ax[2].set(xlabel='BPM')
# blue line taper off at higher periodicity (lower tempi) \
# due to the lack of overlap between the shifted versions of the windowed spectral flux

plt.show()

# Notice that in a variable tempo context, \
# it’s not super meaningful to reduce the tempo information down to a single value.

在这里插入图片描述

节拍跟踪（Dynamic Programming）

# Use DP to recover beats sequence (i.e., temporal locations)
# Ref: https://www.audiolabs-erlangen.de/resources/MIR/FMP/C6/C6S3_BeatTracking.html
'''
Pseudocode:
for i = 1 to N
  int tmp = 0;
  for j = 1 to i-1
    tmp = max(tmp, dp[j] + lamda * penalty(i-j));
  dp[i] = delta[i] + tmp

'''
def beat_track_dp(oenv, tempo, fps, sr, hop_length, tightness=100, alpha=0.5, ref_beats=None):

	period = (fps * 60./tempo) # beat period (given in samples)
	localscore = librosa.beat.__beat_local_score(oenv, period)
	"""Construct the local score for an onset envlope and given period"""
	# localscore is a smoothed version of AGC'd(?) onset envelope
	
	backlink = np.zeros_like(localscore, dtype=int) # save answers in DP process
	cumulative_score = np.zeros_like(localscore)
	
	# Search range for previous beat
	window = np.arange(-2 * period, -np.round(period / 2) + 1, dtype=int)
	
	txwt = -tightness * (np.log(-window / period) ** 2) 
	# penalty function
	# notice window is an array, so txwt saves not only one penalty value
	# tightness means tightness of beat distribution around tempo
	# higher tightness value favours constant tempi
	
	# Are we on the first beat?
	first_beat = True
	for i, score_i in enumerate(localscore):
	
	    # Are we reaching back before time 0?
	    z_pad = np.maximum(0, min(-window[0], len(window)))
	
	    # Search over all possible predecessors
	    candidates = txwt.copy()
	    candidates[z_pad:] = candidates[z_pad:] + cumulative_score[window[z_pad:]]
	
	    # Find the best preceding beat
	    beat_location = np.argmax(candidates)
	
	    # Add the local score
	    cumulative_score[i] = (1-alpha)*score_i + alpha*candidates[beat_location]
	
	    # Special case the first onset.  Stop if the localscore is small
	    if first_beat and score_i < 0.01 * localscore.max():
	        backlink[i] = -1
	    else:
	        backlink[i] = window[beat_location]
	        first_beat = False
	
	    # Update the time range
	    window = window + 1
	
	beats = [librosa.beat.__last_beat(cumulative_score)]
	"""Get the last beat from the cumulative score array"""
	# get the last (final) beat
	
	# Reconstruct the beat path from backlinks
	while backlink[beats[-1]] >= 0:
	    beats.append(backlink[beats[-1]])
	
	# Put the beats in ascending order
	# Convert into an array of frame numbers
	beats = np.array(beats[::-1], dtype=int)
	
	# Discard spurious trailing beats
	beats = librosa.beat.__trim_beats(oenv, beats, trim=True)
	"""Final post-processing: throw out spurious leading/trailing beats"""
	
	# Convert beat times seconds
	beats = librosa.frames_to_time(beats, hop_length=hop_length, sr=sr)
	
	return beats, cumulative_score

alpha = 0.5
tightness=100

est_beats, cumulative_score = beat_track_dp(oenv=spectral_flux, tempo=tempo, fps=fps, sr=sr, hop_length=hop_length, tightness=tightness, alpha=alpha)
fig, ax = plt.subplots(nrows=2, figsize=(14, 6))
times = librosa.times_like(spectral_flux, sr=sr, hop_length=hop_length)
ax[0].plot(times, spectral_flux, label='Spectral flux')
ax[0].set_title('Spectral flux',fontsize=15)
ax[0].label_outer()

ax[0].set(xlim=[0, len(spectral_flux)/fps])
ax[0].vlines(est_beats, 0, 1.1*spectral_flux.max(), label='Estimated beats', 
       color='green', linestyle=':', linewidth=2)
ax[0].legend(loc='upper right')

ax[1].plot(times, cumulative_score, color='orange', label='Cumultative score')
ax[1].set_title('Cumulative score (alpha:'+str(alpha)+')',fontsize=15)
ax[1].label_outer()
ax[1].set(xlim=[0, len(spectral_flux)/fps])
ax[1].vlines(est_beats, 0, 1.1*cumulative_score.max(), label='Estimated beats', 
       color='green', linestyle=':', linewidth=2)
ax[1].legend(loc='upper right')
ax[1].set(xlabel = 'Time')
plt.show()