声音的三要素为响度、音调和音色。
根据目标图像的像素数量，将音频切分为窗口长度相等的多段，逐段进行分析，每一段对应图像中的一个像素。
1.音调
求出每段音频的基频 f0，作为该段的音调值。
2.响度
将音频时域包络的均方根（RMS）值作为音频的响度值；测试后发现，它比取峰值更能体现各段音频之间的响度关系。
3.音色
音色的不同体现在基频与其各次谐波之间的关系上，因此先用 STFT 对每段音频做变换，将人耳听觉范围（20 Hz–20 kHz）以外的频率分量置零，再对幅度谱做滑动平均，最后取其沿频率方向差分的平均值来表示音色。（这里应该可以用更好的指标来体现这种关系。）
最后将响度、音调、音色三个值分别作为 R、G、B 通道的值，组成一幅图像。
% Reset the workspace and the command window so stale variables cannot leak
% into this run.
clear;
clc;

% Output image resolution. The image holds x * y pixels and every pixel is
% derived from one analysis window of the audio.
y = 80;
x = 80;
numofpixel = x * y;

% Load the audio. sampledata is (samples x channels); FS is the sample rate
% in Hz.
[sampledata, FS] = audioread('C:\CloudMusic\Philosophy of yours.mp3');

% Derive the window length (in samples) from the resolution and the audio
% duration. The row count is used explicitly because length() on a matrix
% returns the largest dimension, not necessarily the number of samples.
numSamples = size(sampledata, 1);
timeofsound = numSamples / FS;
time_perpixel = timeofsound / (x * y);
winLength = round(time_perpixel * FS);
if winLength < 1
    error('Audio is too short: %d samples cannot fill %d pixels.', ...
        numSamples, numofpixel);
end
overlapLength = 0;

% Only one channel is turned into an image. Mono files have a single
% column, so fall back to it instead of indexing a missing second channel.
left_data = sampledata(:, 1);
if size(sampledata, 2) >= 2
    right_data = sampledata(:, 2);
else
    right_data = left_data;
end
% Pitch: estimate the fundamental frequency of every analysis window.
[f0, idx] = pitch(left_data, FS, ...
    'Method', 'SRH', ...
    'WindowLength', winLength, ...
    'OverlapLength', overlapLength);
tf0 = idx / FS;

% Loudness: RMS envelope (window of winLength samples), scaled by 1e4 and
% converted to decibels.
temp = 10000 * envelope(left_data, winLength, 'rms');
loudness = 20 * log10(temp);

% Resample the per-sample loudness curve so that exactly one value per
% pixel remains, then keep its magnitude.
desiredPoints = numofpixel;
time = (0:(numel(left_data) - 1)) / FS;
newTime = linspace(time(1), time(end), desiredPoints);
newLoudness = abs(interp1(time, loudness, newTime));
% Timbre: one "smoothness" value per analysis window, computed from the
% magnitude spectrum of that window. Entries beyond length(f0) stay zero.
smoothnessValues = zeros(numofpixel, 1);

% Loop-invariant quantities are computed once.
hopLength = winLength - overlapLength;
analysisWindow = hamming(winLength);

for i = 1:length(f0)
    % Extract the samples of the current window.
    windowStart = (i - 1) * hopLength + 1;
    windowEnd = windowStart + winLength - 1;
    windowedSignal = left_data(windowStart:windowEnd);

    % The signal is exactly one window long, so S is a single spectrum
    % column and f holds the matching frequencies in Hz.
    [S, f] = spectrogram(windowedSignal, analysisWindow, overlapLength, winLength, FS);

    % Zero every bin outside the audible range (20 Hz - 20 kHz).
    freqRange = f >= 20 & f <= 20000;
    % Named specEnvelope (not "envelope") so the envelope() function used
    % for the loudness computation is not shadowed by a variable.
    specEnvelope = abs(S) .* freqRange;

    % Smooth the magnitude spectrum; the moving-average width is tunable.
    smoothEnvelope = movmean(specEnvelope, 10);

    % Smoothness measure: mean of the first difference along frequency.
    smoothnessValues(i) = mean(diff(smoothEnvelope));
end
% Pad f0 with trailing zeros up to one value per pixel so it can later be
% reshaped into an image plane.
TX = 1:numofpixel;   % pixel index axis, only used by the debug plots
size_f = numofpixel - length(f0);
% A single concatenation replaces growing the array one element at a time;
% zeros() yields an empty array when size_f is not positive.
f0 = [f0; zeros(size_f, 1)];
%调试使用
% subplot(6,1,1);
% plot(TX,smoothnessValues);
% subplot(6,1,3);
% plot(TX,f0);
% subplot(6,1,5);
% plot(TX,newLoudness);
% Min-max normalise a feature vector to the full uint8 range [0, 255].
% 255 is the uint8 maximum, so the brightest value maps to full intensity.
% The eps floor keeps a constant vector from dividing by zero; such a vector
% maps to all zeros. The shape of the input is preserved.
toByte = @(v) uint8((v - min(v(:))) * (255 / max(max(v(:)) - min(v(:)), eps)));

smoothnessValues = toByte(smoothnessValues);
f0 = toByte(f0);

% Discard loudness values above the ceiling (this also removes the Inf
% values that come from log10(0) on silent stretches) before normalising.
loudnessCeiling = 70;
newLoudness(newLoudness > loudnessCeiling) = 0;
newLoudness = toByte(newLoudness);
%调试使用
% subplot(6,1,2);
% plot(TX,smoothnessValues);
% subplot(6,1,4);
% plot(TX,f0);
% subplot(6,1,6);
% plot(TX,newLoudness);
% Fold every feature vector into an x-by-y plane (filled column by column)
% and stack the planes as the red, green and blue channels.
planeSize = [x, y];
newLoudness = reshape(newLoudness, planeSize);
f0 = reshape(f0, planeSize);
smoothnessValues = reshape(smoothnessValues, planeSize);
picture = cat(3, newLoudness, f0, smoothnessValues);

% Display the resulting truecolor image.
image(picture);