# For technical testing only; not functional/production code.
#
# NOTE(review): the original file imported librosa and numpy twice; the
# duplicates are collapsed here. pywt is kept although unused, in case
# other tooling expects it.
import pywt  # noqa: F401  -- unused, kept from the original file
import numpy as np
import librosa
import tensorflow as tf
from scipy.spatial.distance import cosine, correlation


def calculate_similarity(audio_file1, audio_file2):
    """Return a rough similarity score between two audio files.

    The score is the unweighted mean of three heuristics computed on
    librosa features: a chroma-delta product score, cosine similarity of
    the time-averaged MFCC vectors, and correlation similarity of the
    time-averaged chroma vectors. Higher means more similar.

    Args:
        audio_file1: path to the first audio file.
        audio_file2: path to the second audio file.

    Returns:
        A float similarity score.
    """
    # librosa.load resamples to its default rate (22050 Hz) here.
    audio1, sr1 = librosa.load(audio_file1)
    audio2, sr2 = librosa.load(audio_file2)

    # Keyword arguments are required by librosa >= 0.10 (the positional
    # audio argument was removed).
    chroma1 = librosa.feature.chroma_stft(y=audio1, sr=sr1)
    chroma2 = librosa.feature.chroma_stft(y=audio2, sr=sr2)
    mfcc1 = librosa.feature.mfcc(y=audio1, sr=sr1)
    mfcc2 = librosa.feature.mfcc(y=audio2, sr=sr2)

    # BUG FIX: the element-wise product requires both chroma matrices to
    # have the same number of frames; trim to the shorter one so files
    # of different lengths do not raise a broadcast error.
    n_frames = min(chroma1.shape[1], chroma2.shape[1])
    chroma_similarity = 1 - np.mean(
        librosa.feature.delta(chroma1)[:, :n_frames]
        * librosa.feature.delta(chroma2)[:, :n_frames]
    )

    # Cosine similarity of the time-averaged MFCC vectors.
    mfcc_similarity = 1 - cosine(np.mean(mfcc1, axis=1), np.mean(mfcc2, axis=1))

    # BUG FIX: scipy.spatial.distance.correlation expects 1-D vectors;
    # the original passed the full 2-D chroma matrices, which raises a
    # ValueError. Compare the time-averaged chroma vectors instead.
    correlation_similarity = 1 - correlation(
        np.mean(chroma1, axis=1), np.mean(chroma2, axis=1)
    )

    # Final score: unweighted mean of the three partial scores.
    return (chroma_similarity + mfcc_similarity + correlation_similarity) / 3.0


def _spectrogram(audio):
    """Magnitude STFT spectrogram with a trailing channel axis.

    Args:
        audio: 1-D float waveform.

    Returns:
        A tensor of shape (time_frames, freq_bins, 1) suitable as input
        for the CNN feature extractor.
    """
    stft = tf.signal.stft(audio, frame_length=1024, frame_step=256, pad_end=True)
    return tf.expand_dims(tf.abs(stft), axis=-1)


def _build_feature_model(input_shape):
    """Build the small CNN used as an (untrained) audio feature extractor.

    NOTE(review): the network is never fitted -- its weights stay at
    their random initialization, so the "features" are essentially a
    random projection of the spectrogram. The original code saved these
    random weights to 'model.h5' and immediately reloaded them, a no-op
    that also clobbered any existing file of that name; that round-trip
    is removed.

    Args:
        input_shape: (time_frames, freq_bins, channels) of one
            spectrogram (no batch dimension).

    Returns:
        A compiled tf.keras.Sequential model mapping a spectrogram to an
        8-dimensional feature vector.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=input_shape),
        tf.keras.layers.Conv2D(16, (3, 3), activation='relu', padding='same'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(8, activation='relu'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
        # NOTE(review): categorical cross-entropy over a relu output is
        # not meaningful, but the model is never trained so the loss is
        # never used; kept from the original for reference.
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )
    return model


def calculate_similarity1(audio_file1, audio_file2):
    """Similarity of two audio files via CNN features of their spectrograms.

    Both files are loaded at 16 kHz, converted to magnitude STFT
    spectrograms, passed through an untrained CNN, and the resulting
    feature vectors are compared with cosine similarity.

    Args:
        audio_file1: path to the first audio file.
        audio_file2: path to the second audio file. Should match the
            first file's duration -- the model input shape is taken from
            the first file's spectrogram.

    Returns:
        A 1x1 numpy array holding the cosine similarity of the two
        feature vectors.
    """
    # Both loads request sr=16000, so the sample rates are always equal;
    # the original's resample branch was dead code and also used the
    # positional librosa.resample signature removed in librosa >= 0.10.
    audio1, _ = librosa.load(audio_file1, sr=16000)
    audio2, _ = librosa.load(audio_file2, sr=16000)

    spec1 = _spectrogram(audio1)
    spec2 = _spectrogram(audio2)

    model = _build_feature_model(spec1.shape)

    # Add the batch dimension expected by model.predict.
    features1 = model.predict(spec1[np.newaxis, ...])
    features2 = model.predict(spec2[np.newaxis, ...])

    # Cosine similarity between the two feature vectors.
    return np.dot(features1, features2.T) / (
        np.linalg.norm(features1) * np.linalg.norm(features2)
    )


def calculate_similarity2(audio_file1, audio_file2):
    """Slide the short reference audio over a long recording.

    The reference spectrogram's full length is used as the window; the
    window slides one time frame at a time over the recording's
    spectrogram, and each window is scored against the reference by
    cosine similarity of CNN features.

    Args:
        audio_file1: the short reference ("standard") audio.
        audio_file2: the long recorded audio to scan.

    Returns:
        A list of per-position cosine-similarity scores (each a nested
        1x1 list, as produced by ndarray.tolist()).
    """
    audio1, _ = librosa.load(audio_file1, sr=16000)
    audio2, _ = librosa.load(audio_file2, sr=16000)

    spec1 = _spectrogram(audio1)
    spec2 = _spectrogram(audio2)

    model = _build_feature_model(spec1.shape)

    # Number of time frames in the reference spectrogram = window length.
    window_size = spec1.shape[0]
    ref_feature = model.predict(spec1[np.newaxis, ...])
    ref_norm = np.linalg.norm(ref_feature)

    similarity_scores = []
    # BUG FIX: the original iterated over spectrogram2.shape[1]
    # (frequency bins) while slicing along axis 0 (time frames); the
    # window must slide along the time axis. The original also computed
    # an unused hop_length -- the step stays 1 frame, as before.
    for start in range(spec2.shape[0] - window_size):
        window = spec2[start:start + window_size, :, :]
        win_feature = model.predict(window[np.newaxis, ...])
        score = np.dot(ref_feature, win_feature.T) / (
            ref_norm * np.linalg.norm(win_feature)
        )
        similarity_scores.append(score.tolist())
    # Original note: "already close to the optimal solution".
    return similarity_scores


if __name__ == "__main__":
    # BUG FIX: the original called calculate_similarity2(base_, base_41)
    # at module level with names that were never defined, raising
    # NameError on import. Fill in real paths to run the comparison:
    # base_41 = "path/to/reference.wav"
    # base_39 = "path/to/recording.wav"
    # print(calculate_similarity2(base_41, base_39))
    pass
依赖于OpenAI问答得到的程序,需要继续调优
于 2023-05-10 13:31:49 首次发布
该代码实现了一个计算音频文件之间相似度的系统,使用librosa库提取MFCC和chroma等特征,结合余弦相似度和相关性计算初步相似度。此外,还利用卷积神经网络(CNN)进一步提取音频的STFT特征,并通过计算特征向量的余弦相似度来评估相似性。
摘要由CSDN通过智能技术生成