本文包括语音端点检测,自相关,求基音周期,AR系数,语音合成,LPC求码本,最近文献的方法小波变换求基音周期。在完整文件下载地址里包含有十分规范的实验报告以及讲解和语音包以及各种函数,还包括有小波变换的MATLAB文件
完整文件下载
语音端点检测部分:
%%addpath(genpath('sap-voicebox-master/voicebox'))
[x,Fs]=audioread('1.wav');
x = x / max(abs(x));
FrameLen = 256;
inc = 90;
amp1 = 10;
amp2 = 2;
zcr1 = 10;
zcr2 = 5;
minsilence = 6;
minlen = 15;
status = 0;
count = 0;
silence = 0;
tmp1 = enframe(x(1:end-1), FrameLen,inc);
tmp2 = enframe(x(2:end) , FrameLen,inc);
signs = (tmp1.*tmp2)<0;
diffs = (tmp1 -tmp2)>0.02;
zcr = sum(signs.*diffs,2);
amp = sum((abs(enframe(filter([1 -0.9375], 1, x), FrameLen, inc))).^2, 2);
amp1 = min(amp1, max(amp)/4);
amp2 = min(amp2, max(amp)/8);
for n=1:length(zcr)
goto = 0;
switch status
case {0,1}
if amp(n) > amp1
x1 = max(n-count-1,1);
status = 2;
silence = 0;
count = count + 1;
elseif amp(n) > amp2 || zcr(n) > zcr2
status = 1;
count = count + 1;
else
status = 0;
count = 0;
end
case 2,
if amp(n) > amp2 ||zcr(n) > zcr2
count = count + 1;
else
silence = silence+1;
if silence < minsilence
count = count + 1;
elseif count < minlen
status = 0;
silence = 0;
count = 0;
else
status = 3;
end
end
case 3,
break;
end
end
count = count-silence/2;
x2 = x1 + count -1;
subplot(3,1,1)
plot(x)
axis([1 length(x) -1 1])
xlabel('帧数');ylabel('Speech');
line([x1*inc x1*inc], [-1 1], 'Color', 'red');
line([x2*inc x2*inc], [-1 1], 'Color', 'red');
subplot(3,1,2)
plot(amp);
axis([1 length(amp) 0 max(amp)])
xlabel('帧数');ylabel('Energy');
line([x1 x1], [min(amp),max(amp)], 'Color', 'red');
line([x2 x2], [min(amp),max(amp)], 'Color', 'red');
subplot(3,1,3)
plot(zcr);
axis([1 length(zcr) 0 max(zcr)])
xlabel('帧数');ylabel('ZCR');
line([x1 x1], [min(zcr),max(zcr)], 'Color', 'red');
line([x2 x2], [min(zcr),max(zcr)], 'Color', 'red');
备注处添加自己语音工具包的路径。语音包里面有个函数需要用到
求一帧语音的自相关函数
x=audioread('1.wav');
s1=x(1:320);
N=320; %选择的窗长,加N=320的矩形窗
A=[];
for k=1:320;
sum=0;
for m=1:N-(k-1);
sum=sum+s1(m)*s1(m+k-1); %计算自相关
end
A(k)=sum;
end
for k=1:320
A1(k)=A(k)/A(1); %归一化A(k)
end
N=160; %选择的窗长,%加N=160的矩形窗
B=[];
for k=1:320;
sum=0;
for m=1:N-(k-1);
sum=sum+s1(m+k-1); %计算自相关
end
B(k)=sum;
end
for k=1:320
B1(k)=B(k)/B(1); %归一化B(k)
end
N=70; %选择的窗长,加N=70的矩形窗
C=[];
for k=1:320;
sum=0;
for m=1:N-(k-1);
sum=sum+s1(m)*s1(m+k-1); %计算自相关
end
C(k)=sum;
end
for k=1:320
C1(k)=C(k)/C(1); %归一化C(k)
end
s2=s1/max(s1);
figure(1)
subplot(4,1,1)
plot(s2)
title('语音信号')
xlabel('样点数')
ylabel('幅值')
axis([0,320,-2,2])
subplot(4,1,2)
plot(A1)
xlabel('延时k')
ylabel('R(k)')
axis([1,320,-2,2]);
legend('N=320')
subplot(4,1,3)
plot(B1);
xlabel('延时k')
ylabel('R(k)')
axis([1,320,-2,2]);
legend('N=160')
subplot(4,1,4)
plot(C1);
xlabel('延时k')
ylabel('R(k)')
axis([0,320,-2,2]);
legend('N=70')
找基音周期
[x,fs]=audioread('1.wav');
wlen=320; inc=80; % 分帧的帧长和帧移
overlap=wlen-inc; % 帧之间的重叠部分
T1=0.05; % 设置基音端点检测的参数
x=x-mean(x); % 消去直流分量
x=x/max(abs(x)); % 幅值归一化
y= enframe(x,wlen,inc)'; % 分帧
fn = size(y,2); % 取得帧数
time = (0 : length(x)-1)/fs; % 计算时间坐标
frameTime = frame2time(fn, wlen, inc, fs); % 计算各帧对应的时间坐标
[voiceseg,vosl,SF,Ef]=pitch_vad1(y,fn,T1); % 基音的端点检测
% 滤波器系数
b=[0.012280 -0.039508 0.042177 0.000000 -0.042177 0.039508 -0.012280];
a=[1.000000 -5.527146 12.854342 -16.110307 11.479789 -4.410179 0.713507];
xx=filter(b,a,x); % 带通数字滤波
yy = enframe(xx,wlen,inc)'; % 滤波后信号分帧
lmin=fix(fs/500); % 基音周期的最小值
lmax=fix(fs/60); % 基音周期的最大值
period=zeros(1,fn); % 基音周期初始化
period=ACF_corr(yy,fn,voiceseg,vosl,lmax,lmin); % 用自相关函数提取基音周期
T0=pitfilterm1(period,voiceseg,vosl); % 平滑处理
% 作图
subplot 211, plot(time,x,'k'); title('语音信号')
axis([0 max(time) -1 1]); grid; ylabel('幅值');
subplot 212; plot(frameTime,T0,'k'); hold on;
xlim([0 max(time)]); title('平滑后的基音周期');
grid; xlabel('时间/s'); ylabel('样点数');
for k=1 : vosl
nx1=voiceseg(k).begin;
nx2=voiceseg(k).end;
nxl=voiceseg(k).duration;
fprintf('%4d %4d %4d %4d\n',k,nx1,nx2,nxl);
subplot 211
line([frameTime(nx1) frameTime(nx1)],[-1 1],'color','red','linestyle','-');
line([frameTime(nx2) frameTime(nx2)],[-1 1],'color','k','linestyle','--');
end
求AR系数
N = 1024;
u = audioread('1.wav');
y(1:2) = u(1:2);
y(3) = -0.69*y(1)+u(3);
y(4) = -0.69*y(2)-0.044*y(1)+u(4);
for n=5:N
y(n) = -0.69*y(n-2)-0.044*y(n-3)+0.0672*y(n-4)+u(n);
end
%M是预测器的阶数,改变M即可改变预测器的阶数
%ry是训练数据的输入自相关
M = 4; %M为预测器的阶数
[ry] = myxcorr(y,M+1);
Ry = toeplitz(ry(1:M));
Rxy = ry(2:M+1)';
w = inv(Ry)*Rxy;
w = w';
%根据levinson迭代公式实现如下函数
%M是预测器的阶数,改变M即可改变预测器的阶数
%ry是训练数据的输入自相关
%init
p(1) = ry(1);
for m = 1:M
a(m,1) = 1;
a(m,m+1) = 0;
tmp = 0;
for L = 0:m-1
tmp = tmp + ry(m-L+1)*a(m-1+1,L+1);
end
delt(m) = tmp; %delt(1)相当于delta(0)
k(m) = -delt(m)/p(m); %k(1)就是看k(1)
p(m+1) = p(m)*(1-abs(k(m))^2); %p(m+1)相当于p(m)就是p(2)相当于p(1)
for L = 0:m
a(m+1,m-L+1) = a(m-1+1,m-L+1) + k(m)*a(m-1+1,L+1);
end
end
levinson_w = -a(M+1,2:M+1)
levinson_a = a(M+1,:)
求LPC码本
I = audioread('1.wav');%读入原始语音
subplot(3,1,1),plot(I);
title('原始语音波形')
%对指定帧位置进行加窗处理
Q = I';
N = 256; % 窗长
Hamm = hamming(N); % 加窗
frame = 30;%需要处理的帧位置
M = Q(((frame - 1) * (N / 2) + 1):((frame - 1) * (N / 2) + N));
Frame = M .* Hamm';%加窗后的语音帧
[B,F,T] = specgram(I,N,N/2,N);
[m,n] = size(B);
for i = 1:m
FTframe1(i) = B(i,frame);
end
P =2; %input('请输入预测器阶数 = ');
ai = lpc(Frame,P); % 计算lpc系数
LP = filter([0 -ai(2:end)],1,Frame); % 建立语音帧的正则方程
FFTlp = fft(LP);
E = Frame - LP; % 预测误差
subplot(3,1,2),plot(1:N,Frame,1:N,LP,'-r');grid;
title('原始语音和预测语音波形')
subplot(3,1,3),plot(E);grid;
title('预测误差');
pause
fLength(1 : 2 * N) = [M,zeros(1,N)];
Xm = fft(fLength,2 * N);
X = Xm .* conj(Xm);
Y = fft(X , 2 * N);
Rk = Y(1 : N);
PART = sum(ai(2 : P + 1) .* Rk(1 : P));
G = sqrt(sum(Frame.^2) - PART);
A = (FTframe1 - FFTlp(1 : length(F'))) ./ FTframe1 ;
subplot(2,1,1),plot(F',20*log(abs(FTframe1)),F',(20*log(abs(1 ./ A))),'-r');grid;
xlabel('频率/dB');ylabel('幅度');
title('短时谱');
subplot(2,1,2),plot(F',(20*log(abs(G ./ A))));grid;
xlabel('频率/dB');ylabel('幅度');
title('LPC谱');
pause
%求出预测误差的倒谱
pitch = fftshift(rceps(E));
M_pitch = fftshift(rceps(Frame));
subplot(2,1,1),plot(M_pitch);grid;
xlabel('语音帧');ylabel('/dB');
title('原始语音帧倒谱');
subplot(2,1,2),plot(pitch);grid;
xlabel('语音帧');ylabel('/dB');
title('预测误差倒谱');
pause
%画出语谱图
ai1 = lpc(I,P); % 计算原始语音lpc系数
LP1 = filter([0 -ai(2:end)],1,I); % 建立原始语音的正则方程
subplot(2,1,1);
specgram(I,N,N/2,N);
title('原始语音语谱图');
subplot(2,1,2);
specgram(LP1,N,N/2,N);
title('预测语音语谱图');
语音合成
[y1,fs]=audioread('1.wav'); %读取语音一信号
[y2,fs]=audioread('2.wav'); %读取语音二信号
L1=length(y1); %测定语音一信号长度
L2=length(y2); %测定语音二信号长度
a1=y1.*hamming(L1); %加窗预处理
a2=y2.*hamming(L2); %加窗预处理
L1=length(a1); %测定语音一信号长度
L2=length(a2); %测定语音二信号长度
%采样信号的时域显示
figure(1);
subplot(211);
plot(a1);
title('语音一载波信号时域波形');
subplot(212);
plot(a2);
title('语音二调幅信号时域波形');
%傅里叶频谱绘制
F1=fft(a1,L1);
F2=fft(a2,L2);
AF1=abs(F1);
AF2=abs(F2);
figure(2);
subplot(211);
plot(AF1);
title('语音一载波信号幅频特性显示');
subplot(212);
plot(AF2);
title('语音二调幅信号幅频特性显示');
figure(3);
freqz(F1);
title('语音一载波信号FFT频谱显示');
figure(4);
freqz(F2);
title('语音二载波信号FFT频谱显示');
%获取语音一信号的开始位置
for i=1:L1-4
g(i)=a1(i).*a1(i+1).*a1(i+2).*a1(i+3).*a1(i+4);%认为连续4个幅值不为0的信号即为开始
if g(i)~=0
break;
else i=i+1;
end
end
I=i;
%获取语音一信号的开始位置
for i=1:L1-4
g(i)=a1(i).*a1(i+1).*a1(i+2).*a1(i+3).*a1(i+4);%认为连续4个幅值不为0的信号即为开始
if g(i)~=0
break;
else i=i+1;
end
end
I=i;
% 获取语音二信号开始位置
for j=1:L2-4
m(j)=a2(j).*a2(j+1).*a2(j+2).*a2(j+3).*a2(j+4);
if m(j)~=0
break;
else j=j+1;
end
end
J=j;
%语音二信号hilbert变换
H=hilbert(a2);
figure(5);
plot(abs(H));
title('语音二信号包络显示');
%信号对齐,语音二包络调制语音一振幅
max1=max(I,J);
for k=1:L1-max1
N(k)=a1(i).*H(j);
i=i+1;
j=j+1;
end
%N=N';
N = N/(max(abs(N)) * 1.05);
%wavwrite(N,16000,16,'HC.wav');
figure(6);
plot(imag(N));
title('合成信号时域显示');
pause(1);
FN=fft(N);
figure(7);
freqz(FN);
title('合成声音信号FFT显示');
figure(8);
plot(abs(FN));
title('合成声音信号的幅频特性');
自定义函数部分:
1.assign,m
function f=assign1(a)
%b=[0.012280 -0.039508 0.042177 0.000000 -0.042177 0.039508 -0.012280];
a=[1.000000 -5.527146 12.854342 -16.110307 11.479789 -4.410179 0.713507];
findSegment,m
function soundSegment=findSegment(express)
if express(1)==0
voicedIndex=find(express); % 寻找express中为1的位置
else
voicedIndex=express;
end
soundSegment = [];
k = 1;
soundSegment(k).begin = voicedIndex(1); % 设置第一组有话段的起始位置
for i=1:length(voicedIndex)-1,
if voicedIndex(i+1)-voicedIndex(i)>1, % 本组有话段结束
soundSegment(k).end = voicedIndex(i); % 设置本组有话段的结束位置
soundSegment(k+1).begin = voicedIndex(i+1);% 设置下一组有话段的起始位置
k = k+1;
end
end
soundSegment(k).end = voicedIndex(end); % 最后一组有话段的结束位置
% 计算每组有话段的长度
for i=1 :k
soundSegment(i).duration=soundSegment(i).end-soundSegment(i).begin+1;
end
fra.m 部分
%fra.m
function f=fra(len,inc,x) %?
fh=fix(((size(x,1)-len)/inc)+1)%
f=zeros(fh,len);
i=1;n=1;
while i<=fh %?
j=1;
while j<=len
f(i,j)=x(n);
j=j+1;
n=n+1;
end
n=n-len+inc;%
i=i+1;
end
frame2time.m
function frameTime=frame2time(frameNum,framelen,inc,fs)
frameTime=(((1:frameNum)-1)*inc+framelen/2)/fs;
function zcr = get_st_zcr( x,fs,wlen_time,step_time,win_type )
if(min(size(x))>1)
end
wlen = round(wlen_time * fs);
nstep = round(step_time * fs);
if nargin < 5
win = hamming(wlen);
elseif narmin == 5
if strcmp(win_type, 'hamming')
win = hamming(wlen);
elseif strcmp(win_type, 'hanning')
win = hanning(wlen);
else
win = hamming(wlen);
end
else
win = hamming(wlen);
end
nFrames = floor((length(x) - wlen)/nstep) + 1;
zcr = [];
for k = 1:nFrames
idx = (k-1) * nstep + (1:wlen);
x_sub = x(idx) .* win;
x_sub1 = x_sub(1:end-1);
x_sub2 = x_sub(2:end);
zcr(k) = sum(abs(sign(x_sub1) - sign(x_sub2))) / 2 / length(x_sub1);
end
linsmoothm.m
function [y] = linsmoothm(x,n)
if nargin< 2
n=3;
end
win=hanning(n); % 用hanning窗
win=win/sum(win); % 归一化
x=x(:)'; % 把x转换为行序列
len=length(x);
y= zeros(len,1); % 初始化y
% 对x序列前后补n个数,以保证输出序列与x相同长
if mod(n, 2) ==0
l=n/2;
x = [ones(1,1)* x(1) x ones(1,l)* x(len)]';
else
l=(n-1)/2;
x = [ones(1,1)* x(1) x ones(1,l+1)* x(len)]';
end
% 线性平滑处理
for k=1:len
y(k) = win'* x(k:k+ n- 1);
end
myxcorr.m
function [ rx ] = myxcorr( x, M )
%该函数用于求出 x 的自相关函数,从r(1)到r(M)
% 输入x为一个1 X N 维矩阵的矩阵,M阶数
% 输出rx为自相关函数
N = length(x);
rx = zeros(1,M);
for i = 1:M %加一是为了求Rxy
rx(i) = x(i:N)*x(1:N-i+1)'/(N-i+1);
end
% Rx = toeplitz(rx);
end
pinghua.m
function pt2=pinghua(pt)
%平滑--------
%1.找出平滑段,五个点就足够了
dpt=diff(pt);
dpt=abs(dpt);
for i=3:length(dpt)-2
dptsum(i-2)=sum(dpt(i-2:i+2));
end
[z,w]=find(dptsum<45);
w1=w(1);
pt1=pt(w1:end);
pt12=pt(1:w1);
%2.初步平滑
c1=10;
c2=25;
%首先由平滑段向后
for i=2:length(pt1)-1
if abs(pt1(i)/2-pt1(i-1))<c1
pt1(i)=pt1(i)/2;
elseif abs(pt1(i)*2-pt1(i-1))<c1
pt1(i)=pt1(i)*2;
elseif (abs(pt1(i)-pt1(i-1))>c1)&(abs(pt1(i+1)-pt1(i-1))>c2)
pt1(i)=2*pt1(i-1)-pt1(i-2);
elseif (abs(pt1(i)-pt1(i-1))>c1)&(abs(pt1(i+1)-pt1(i-1))<c2)
pt1(i)=(pt1(i-1)+pt1(i+1))/2;
end
end
%处理最后一个点
if abs(pt1(end)/2-pt1(end-1))<c1
pt1(end)=pt1(end)/2;
elseif abs(pt1(end)*2-pt1(end-1))<c1
pt1(end)=pt1(end)*2;
end
%再由平滑段向前处理
if w1>3
for i=w1-1:-1:2
if abs(pt12(i)/2-pt12(i+1))<c1
pt12(i)=pt12(i)/2;
elseif abs(pt12(i)*2-pt12(i+1))<c1
pt12(i)=pt12(i)*2;
elseif (abs(pt12(i)-pt12(i+1))>c1)&(abs(pt12(i-1)-pt12(i+1))>c2)
pt12(i)=2*pt12(i+1)-pt12(i+2);
elseif (abs(pt12(i)-pt12(i+1))>c1)&(abs(pt12(i-1)-pt12(i+1))<c2)
pt12(i)=(pt12(i-1)+pt12(i+1))/2;
end
end
%处理第一个点
if abs(pt12(1)/2-pt12(1+1))<c1
pt12(1)=pt12(1)/2;
elseif abs(pt12(1)*2-pt12(1+1))<c1
pt12(1)=pt12(1)*2;
end
pt1=[pt12(1:end-1),pt1];
end
pt2=pt1;
%3.进一步调整
%4.中值平滑
for i=3:length(pt1)-2
pt2(i)=sum(pt1(i-2:i+2))/5;
end
pitch_vad1.m
function [voiceseg,vosl,SF,Ef]=pitch_vad1(y,fn,T1,miniL)
if nargin<4, miniL=10; end
if size(y,2)~=fn, y=y'; end
wlen=size(y,1);
for i=1:fn
Sp = abs(fft(y(:,i)));
Sp = Sp(1:wlen/2+1);
Esum(i) = sum(Sp.*Sp);
prob = Sp/(sum(Sp));
H(i) = -sum(prob.*log(prob+eps));
end
hindex=find(H<0.1);
H(hindex)=max(H);
Ef=sqrt(1 + abs(Esum./H));
Ef=Ef/max(Ef);
zindex=find(Ef>=T1);
zseg=findSegment(zindex);
zsl=length(zseg);
j=0;
SF=zeros(1,fn);
for k=1 : zsl
if zseg(k).duration>=miniL
j=j+1;
in1=zseg(k).begin;
in2=zseg(k).end;
voiceseg(j).begin=in1;
voiceseg(j).end=in2;
voiceseg(j).duration=zseg(k).duration;
SF(in1:in2)=1;
end
end
vosl=length(voiceseg);
pitfilterm1.m
function y=pitfilterm1(x,vseg,vsl)
y=zeros(size(x)); % 初始化
for i=1 : vsl % 有段数据
ixb=vseg(i).begin; % 该段的开始位置
ixe=vseg(i).end; % 该段的结束位置
u0=x(ixb:ixe); % 取来一段数据
y0=medfilt1(u0,5); % 5点的中值滤波
v0=linsmoothm(y0,5); % 线性平滑
y(ixb:ixe)=v0; % 赋值给y
end
zcro.m
%zcro.m
function f=zcro(x)
f=zeros(size(x,1),1); %
for i=1:size(x,1)
z=x(i,:);
for j=1:(length(z)-1);
if z(j)*z(j+1)<0;
f(i)=f(i)+1;
end
end
end