每个人说话的内容虽然不一样,但声音的频率基本不会变,所以声音匹配的原理是比较各段录音的频率分布情况。
import librosa
import os
from collections import Counter
import numpy as np
from functools import reduce
# Root directory that main() hands to initdata(), which walks it recursively
# looking for recordings to compare.
# NOTE(review): absolute path on the author's machine — adjust before running.
dirpath = "/Users/birenjianmo/Desktop/learn/librosa/input/分类"
def initdata(dirpath, num=40):
    """Build a feature vector for every audio file found under ``dirpath``.

    The directory tree is walked recursively and every file whose extension
    is ``.mp3`` or ``.wav`` (matched case-insensitively) is passed to
    ``getfrequeciesdistribute``.

    Args:
        dirpath: Root directory to scan.
        num: Number of dominant frequencies requested for each file.
            Defaults to 40.

    Returns:
        A dict mapping each audio file's bare name to the array returned by
        ``getfrequeciesdistribute``. The dict is empty when no audio file is
        found.
    """
    datainfo = {}
    for root, _dirs, files in os.walk(dirpath):
        for file in files:
            # Lower-case the extension so "A.MP3" is treated like "a.mp3".
            extension = os.path.splitext(file)[-1].lower()
            if extension not in (".mp3", ".wav"):
                continue
            filepath = os.path.join(root, file)
            # NOTE: entries are keyed by bare file name, so files sharing a
            # name in different sub-directories overwrite one another.
            # getfrequeciesdistribute2 is an alternative extractor with the
            # same call signature.
            datainfo[file] = getfrequeciesdistribute(filepath, num)
    return datainfo
def clean2(y, sr):
    """Suppress the weak spectral components of an audio signal.

    The signal is moved to the time-frequency domain with a short-time
    Fourier transform, every bin whose magnitude is below 1/30 of the
    largest magnitude is set to zero, and the result is transformed back
    to a time series.

    Args:
        y: Audio time series, e.g. the first value returned by
            ``librosa.load``.
        sr: Sampling rate of ``y``. Accepted for interface compatibility;
            the transform used here does not depend on it.

    Returns:
        The filtered time series produced by ``librosa.istft``.
    """
    # Only the STFT matrix is needed. With default arguments this is the
    # same matrix librosa.ifgram returned as its second value, without
    # computing instantaneous frequencies that would be thrown away (and
    # without relying on ifgram, which newer librosa releases removed).
    D = librosa.stft(y)
    magnitude = np.abs(D)
    top = magnitude.max() / 30
    D[magnitude < top] = 0
    return librosa.istft(D)
def _top_frequency_counts(filepath, num):
    """Return the ``num`` most common non-zero frequencies of a recording.

    The file is loaded, filtered with ``clean2``, and its instantaneous
    frequencies are truncated to integers and snapped down to even values.

    Returns:
        A list of ``(frequency, count)`` tuples, most common first.
    """
    y, sr = librosa.load(filepath)
    y = clean2(y, sr)
    # NOTE(review): librosa.ifgram is deprecated/removed in newer librosa
    # releases (reassigned_spectrogram is the documented successor, but it
    # marks low-energy bins as NaN by default) — confirm the pinned version
    # before upgrading.
    frequencies, _ = librosa.ifgram(y, sr=sr)
    # Truncate to int and round down to the nearest even value so that
    # neighbouring frequencies fall into the same bucket.
    frequencies = frequencies.astype(int) // 2 * 2
    counts = Counter(frequencies.flatten().tolist())
    # Drop the 0 bucket *before* ranking; filtering it afterwards would
    # leave num - 1 entries whenever 0 is among the top ``num``.
    counts.pop(0, None)
    return counts.most_common(num)


def _pad_to_length(values, num, dtype):
    """Return ``values`` as an array right-padded with zeros to ``num`` items."""
    size = len(values) if num is None else max(num, len(values))
    padded = np.zeros(size, dtype=dtype)
    padded[:len(values)] = values
    return padded


def getfrequeciesdistribute(filepath, num=100):
    """Compute a count-weighted dominant-frequency vector for an audio file.

    Each of the ``num`` most common non-zero frequency buckets contributes
    ``frequency * count / total``, where ``total`` is the summed count of
    the selected buckets. Entries are ordered from most to least common.

    Args:
        filepath: Path of the audio file to analyse.
        num: Number of frequency buckets to keep.

    Returns:
        A float array of length ``num``; it is zero-padded when the
        recording has fewer than ``num`` distinct non-zero buckets, so
        vectors of different recordings can be compared element-wise.
    """
    data = _top_frequency_counts(filepath, num)
    total = sum(count for _, count in data)
    if total:
        weighted = np.array([freq * count for freq, count in data]) / total
    else:
        # No non-zero frequency at all (e.g. silence): avoid dividing by 0.
        weighted = []
    frequeciesdistribute = _pad_to_length(weighted, num, float)
    # Emit the vector on stdout for inspection.
    print(frequeciesdistribute)
    return frequeciesdistribute


def getfrequeciesdistribute2(filepath, num=100):
    """Return the dominant frequencies of an audio file, unweighted.

    Args:
        filepath: Path of the audio file to analyse.
        num: Number of frequency buckets to keep.

    Returns:
        An integer array of length ``num`` holding the most common non-zero
        frequency buckets, most common first, zero-padded when the
        recording has fewer than ``num`` distinct non-zero buckets.
    """
    data = _top_frequency_counts(filepath, num)
    return _pad_to_length([freq for freq, _ in data], num, int)
def main():
    """Print, for every recording, the other recording closest to it.

    Feature vectors are obtained from ``initdata(dirpath)``. Closeness is
    the sum of squared element-wise differences; the recording with the
    smallest value is reported as ``"<name> 相似 <match> loss <value>"``.
    A recording with nothing to compare against produces no output line.
    """
    data = initdata(dirpath)
    for name1, vector1 in data.items():
        best_name = None
        best_loss = None
        for name2, vector2 in data.items():
            if name1 == name2:
                continue
            # Compare only the common prefix so vectors of different
            # lengths neither raise nor broadcast silently; entries are
            # ordered by rank, so the prefix holds the dominant buckets.
            n = min(len(vector1), len(vector2))
            loss = ((vector1[:n] - vector2[:n]) ** 2).sum()
            if best_loss is None or loss < best_loss:
                best_loss = loss
                best_name = name2
        if best_name is None:
            # Only one recording was found: there is no match to report.
            continue
        print(f"{name1} 相似 {best_name} loss {best_loss}")
# Run the comparison only when the file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()
输出结果:
王小婷2.mp3 相似 王小婷1.mp3 loss 52119.23285928133
王小婷1.mp3 相似 王小婷2.mp3 loss 52119.23285928133
许森1.mp3 相似 许森2.mp3 loss 5928.603683607824
许森2.mp3 相似 许森1.mp3 loss 5928.603683607824
老男人2.mp3 相似 老男人1.mp3 loss 502.5641025641024
老男人3.mp3 相似 耿镇源2.mp3 loss 6546.419460880999
老男人1.mp3 相似 老男人2.mp3 loss 502.5641025641024
小新1.wav 相似 小新2.wav loss 1091.3372936798748
小新2.wav 相似 小新1.wav loss 1091.3372936798748
杜蕾蕾1.mp3 相似 杜蕾蕾3.mp3 loss 13920.36756477113
杜蕾蕾3.mp3 相似 小新1.wav loss 4795.619575262194
杜蕾蕾2.mp3 相似 小新1.wav loss 6461.275476660093
耿镇源1.mp3 相似 耿镇源2.mp3 loss 7457.570019723867
耿镇源2.mp3 相似 老男人3.mp3 loss 6546.419460880999
女人3.mp3 相似 许森2.mp3 loss 37871.214062747385
女人2.mp3 相似 女人1.mp3 loss 3239.918680026809
女人1.mp3 相似 女人2.mp3 loss 3239.918680026809
部分录音内容太短(只有 1 秒),导致误差较大;运行时应选择时长较长的录音。