音频特征
音频特征是语音本身的特征,既不包含来自转录文本的特征,也不包含语音转文本模型输出的文本字符串。
Audio Feature | Description | Use case |
---|---|---|
Mel-frequency cepstral coefficients (MFCC) | 在人声范围内变窄的频带(通常是13个系数,但可以更多) | Classifying phonemes. |
Mel-frequency cepstral delta coefficients (MFCC delta) | 上述MFCC系数的变化。 | Classifying phonemes. |
Fundamental frequency | 周期性声音波形的最低频率。 | Useful for classifying genders. |
Jitter | 基频的周期间变化。 | Useful for speaker recognition and pathological voice quality. |
Shimmer | 振幅的周期变化。 | Useful for speaker recognition and pathological voice quality. |
Formant frequencies | 高频共振可以通过确定第一共振峰的所有奇数倍来计算(例如乘以1、3、5、7……:$F_z = F_1 \times (2z - 1)$)。 | Detecting intratracheal lengths. |
File duration | 音频文件的长度 | Detecting speaking rates. |
Root mean squared (RMS) energy | 在一段时间内发射到麦克风中的能量的平均值。 | Detecting stress or new environments. |
Spectral centroid | 音频频谱的质心,即信号中各频率的加权平均值:$\mathrm{centroid} = \frac{\sum_n f(n)\,x(n)}{\sum_n x(n)}$,其中 x(n) 表示第 n 个频率区间(bin)的加权频率值或幅度,f(n) 表示该区间的中心频率。 | Characterizes ‘brightness’ of sound (timbre). |
Spectral flux | 音频信号的功率谱从一帧到下一帧的变化有多快。 | Characterizes environments. |
Onset strength | 对声音记录开始和停止的功率的测量。 | Helps localize sound sources. |
Spectral contrast | 波峰和波谷之间的分贝差。 | Helps to detect noise in samples. |
Spectral flatness | 帮助评估音频信号自相似性的熵度量。 | Useful for noise detection (1) compared to tone-signals (0). |
Spectral rolloff | 总频谱能量的给定比例(例如85%)位于其下的频率。 | Speech bandwidth characterization and segmentation (diarization). |
Zero-crossing rates | 音频样本中符号变化的速率(+/-)或(-/+)。 | Useful to measure periodicity and detect voices. |
librosa features
librosa_features.py
import librosa
import numpy as np
# get statistical features in numpy
def stats(matrix):
    """Summarize an array with five descriptive statistics.

    Every statistic is computed over all elements of ``matrix`` (no axis
    is given, so multi-dimensional input is reduced as a whole).

    Returns:
        A 1-D numpy array ordered as
        [mean, standard deviation, maximum, minimum, median].
    """
    summary = (
        np.mean(matrix),
        np.std(matrix),
        np.amax(matrix),
        np.amin(matrix),
        np.median(matrix),
    )
    return np.array(summary)
# featurize with librosa following documentation
# https://librosa.github.io/librosa/feature.html
def librosa_featurize(filename, categorize):
    """Extract summary audio features from a file using librosa.

    Each frame-level feature is reduced with ``stats`` (five values per
    call), so the output has a fixed length regardless of audio duration.

    Args:
        filename: Path of the audio file, passed to ``librosa.load``.
        categorize: If truthy, return the features grouped by category;
            otherwise return one flat array.

    Returns:
        When ``categorize`` is truthy, a dict with keys ``'onset'``,
        ``'rhythm'``, ``'spectral'`` and ``'power'`` mapping to 1-D numpy
        arrays. Otherwise a single 1-D numpy array holding those four
        groups concatenated in that order.
    """
    print('librosa featurizing: %s'%(filename))
    y, sr = librosa.load(filename)

    # FEATURE EXTRACTION
    ######################################################
    # Audio is passed by keyword: recent librosa releases reject it as a
    # positional argument. sr is the rate librosa.load returned.
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    poly_features = librosa.feature.poly_features(y=y, sr=sr)
    tempogram = librosa.feature.tempogram(y=y, sr=sr)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
    # only the first contrast band is kept
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)[0]
    spectral_flatness = librosa.feature.spectral_flatness(y=y)[0]
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
    onset_frames = librosa.onset.onset_detect(y=y, sr=sr)
    onset_strength = librosa.onset.onset_strength(y=y, sr=sr)
    zero_crossings = librosa.feature.zero_crossing_rate(y=y)[0]

    # rmse was renamed to rms, and beat.tempo moved to feature.tempo, in
    # newer librosa releases; use whichever this installation provides.
    rms_fn = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
    rmse = rms_fn(y=y)[0]
    tempo_fn = getattr(librosa.feature, 'tempo', None) or librosa.beat.tempo
    tempo = tempo_fn(y=y, sr=sr)[0]

    # FEATURE CLEANING
    ######################################################
    # onset detection features: count, stats of onset frames, tempo,
    # stats of onset strength. With no detected onsets the stats are
    # zero-filled, because max/min of an empty array raise.
    if len(onset_frames):
        onset_stats = stats(onset_frames)
    else:
        onset_stats = np.zeros(5)
    onset_features = np.concatenate([[len(onset_frames)],
                                     onset_stats,
                                     [tempo],
                                     stats(onset_strength)])

    # rhythm features - stats of the first 13 tempogram rows
    rhythm_features = np.concatenate([stats(row) for row in tempogram[:13]])

    # spectral features (first 13 mfccs, then the other spectral series)
    spectral_series = list(mfcc[:13]) + [poly_features[0],
                                         poly_features[1],
                                         spectral_centroid,
                                         spectral_bandwidth,
                                         spectral_contrast,
                                         spectral_flatness,
                                         spectral_rolloff]
    spectral_features = np.concatenate([stats(row) for row in spectral_series])

    # power features
    power_features = np.concatenate([stats(zero_crossings), stats(rmse)])

    if categorize:
        # output feature categories
        features = {'onset': onset_features,
                    'rhythm': rhythm_features,
                    'spectral': spectral_features,
                    'power': power_features}
    else:
        # Concatenate a plain list: the groups differ in length, and
        # wrapping them in np.array first would build a ragged array.
        features = np.concatenate([onset_features,
                                   rhythm_features,
                                   spectral_features,
                                   power_features])
    return features
# Example usage: featurize 'test.wav'; categorize=False requests one flat feature array.
features=librosa_featurize('test.wav', False)
pyaudioanalysis features
pyaudio_features.py
import os,json
import numpy as np
def stats(matrix):
    """Return [mean, std, max, min, median] of ``matrix`` as a numpy array.

    Each reducer is applied to the whole input without an axis argument,
    so the result always has exactly five entries.
    """
    reducers = (np.mean, np.std, np.amax, np.amin, np.median)
    return np.array([reducer(matrix) for reducer in reducers])
def pyaudio_featurize(file):
    """Featurize an audio file through the ``pyaudio_help.py`` helper script.

    The helper is run as a separate ``python`` process and is expected to
    export its result as ``file[0:-4] + '.json'`` with a ``'features'``
    entry. Each row of that entry is reduced with ``stats`` and the
    temporary JSON file is deleted afterwards.

    Args:
        file: Path of the audio file. The JSON name is derived by dropping
            the last four characters, so a three-letter extension such as
            ``.wav`` is assumed.

    Returns:
        A tuple ``(new_features, labels)``: a 1-D numpy array with five
        statistics per feature row, and a list of matching names such as
        ``'mean zero crossing rate'``.

    Raises:
        FileNotFoundError: If ``python`` cannot be started or the helper
            did not produce the JSON file.
        IndexError: If the helper returns more feature rows than there
            are known label names.
    """
    # Local import keeps this block self-contained.
    import subprocess

    # use pyaudioanalysis library to export features
    # An argument list (no shell) keeps filenames with spaces or shell
    # metacharacters from being split or interpreted.
    subprocess.run(['python', 'pyaudio_help.py', file])
    jsonfile = file[0:-4] + '.json'
    with open(jsonfile) as handle:
        g = json.load(handle)
    features = np.array(g['features'])

    all_labels = ['zero crossing rate', 'energy', 'entropy of energy', 'spectral centroid',
                  'spectral spread', 'spectral entropy', 'spectral flux', 'spectral rolloff',
                  'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4',
                  'mfcc5', 'mfcc6', 'mfcc7', 'mfcc8',
                  'mfcc9', 'mfcc10', 'mfcc11', 'mfcc12',
                  'mfcc13', 'chroma1', 'chroma2', 'chroma3',
                  'chroma4', 'chroma5', 'chroma6', 'chroma7',
                  'chroma8', 'chroma9', 'chroma10', 'chroma11',
                  'chroma12', 'chroma deviation']
    # Same order as the values returned by stats().
    stat_names = ('mean', 'std', 'max', 'min', 'median')

    # now go through all the features and get statistical features for array
    new_features = []
    labels = []
    for i, row in enumerate(features):
        base_label = all_labels[i]
        new_features.extend(stats(row))
        labels.extend(name + ' ' + base_label for name in stat_names)

    new_features = np.array(new_features)
    os.remove(jsonfile)
    return new_features, labels
# Example usage: featurize 'test.wav'; returns the feature array and its label list.
features, labels =pyaudio_featurize('test.wav')
SoX features
sox_features.py
import os
import numpy as np
def clean_text(text):
    """Pull the numeric value out of each line of sox/soxi text output.

    The text is lower-cased and letters, spaces and the punctuation
    ``: ( ) =`` are removed. Each remaining line is then parsed as a
    float; lines that do not parse are skipped.

    A single leading ``-`` is kept as the sign, so negative values such as
    a minimum amplitude stay negative. Any other ``-`` (e.g. the one in
    ``16-bit``) is discarded.

    Args:
        text: Raw multi-line output of a sox command.

    Returns:
        A list of floats, one per line that could be parsed, in the order
        the lines appear.
    """
    # One translate pass replaces the chain of per-character replace()
    # calls. '-' is handled per line below so the sign can be inspected.
    noise = str.maketrans('', '', 'abcdefghijklmnopqrstuvwxyz :()=')
    values = []
    for line in text.lower().translate(noise).split('\n'):
        line = line.strip()
        # Exactly one '-' at the very start is treated as a sign.
        negative = line.startswith('-') and line.count('-') == 1
        # "'.'" is what remains of a quoted filename like 'test.wav' once
        # its letters are gone; dropping it leaves an empty, skipped line.
        candidate = line.replace('-', '').replace("'.'", '')
        try:
            number = float(candidate)
        except ValueError:
            # not a numeric line (header, filename, duration, ...)
            continue
        values.append(-number if negative else number)
    return values
def sox_featurize(filename):
    """Featurize an audio file with the ``soxi`` and ``sox ... stat`` tools.

    The output of each command is written to a text file next to the
    input (``<base>_soxi.txt`` and ``<base>_stats.txt``), read back, and
    reduced to numbers with ``clean_text``. Both text files are left on
    disk.

    Args:
        filename: Path of the audio file to analyze.

    Returns:
        A tuple ``(features, labels)``: a numpy array of the parsed numbers
        (soxi values first, then stat values) and the list of label names.
        NOTE(review): ``clean_text`` skips lines it cannot parse, so
        ``len(features)`` may differ from ``len(labels)`` - confirm
        against real sox output before pairing them by index.

    Raises:
        FileNotFoundError: If the ``soxi`` or ``sox`` executable cannot be
            started.
    """
    # Local import keeps this block self-contained.
    import subprocess

    # soxi and stats files
    # splitext handles extensions of any length, unlike filename[0:-4].
    base = os.path.splitext(filename)[0]
    soxifile = base + '_soxi.txt'
    statfile = base + '_stats.txt'

    # Argument lists (no shell) keep filenames with spaces or shell
    # metacharacters intact; output is redirected through file handles.
    with open(soxifile, 'w') as out:
        subprocess.run(['soxi', filename], stdout=out)
    with open(statfile, 'w') as out:
        # stderr is merged into the file, as the original '2>&1' did
        subprocess.run(['sox', filename, '-n', 'stat'],
                       stdout=out, stderr=subprocess.STDOUT)

    # get basic info
    with open(soxifile) as handle:
        s1 = clean_text(handle.read())
    s1_labels = ['channels', 'samplerate', 'precision',
                 'duration', 'filesize', 'bitrate', 'sample encoding']

    with open(statfile) as handle:
        s2 = clean_text(handle.read())
    s2_labels = ['samples read', 'length', 'scaled by', 'maximum amplitude',
                 'minimum amplitude', 'midline amplitude', 'mean norm', 'mean amplitude',
                 'rms amplitude', 'max delta', 'min delta', 'mean delta',
                 'rms delta', 'rough freq', 'vol adj']

    labels = s1_labels + s2_labels
    features = np.array(s1 + s2)
    return features, labels
# Example usage: featurize 'test.wav'; returns the feature array and its label list.
features, labels = sox_featurize('test.wav')
Audioset features
audioset_features.py
# ^^ ... (down to main script in audioset_features.py)
# get current directory
curdir=os.getcwd()
# download audioset files if audioset not in current directory
if 'audioset' not in os.listdir():
    try:
        setup_audioset(curdir)
    except Exception:
        # Best-effort setup: report the failure and carry on. Catching
        # Exception rather than using a bare except lets Ctrl-C and
        # SystemExit still stop the script.
        print('there was an error installing audioset')
# record a 10 second, mono 16k Hz audio file in the current directory
filename='test.wav'
sync_record(filename,10,16000,1)
# now let's featurize an audio sample in the current directory, test.wav
features, new_features =audioset_featurize(filename)
# print('new features')
# print(new_features)
# print(len(new_features))