Introduction
SpeakerBeam was published at Interspeech 2018. The technique takes information about the target speaker in advance and then extracts that speaker's voice from a mixed recording. Compared with earlier, conventional speech separation algorithms, SpeakerBeam needs no prior knowledge of the number or positions of the speakers in the recording, which better matches real-world conditions.
As shown in the figure, SpeakerBeam involves two models. Mixture model: the source signals are summed after the STFT to form the mixed speech. Extraction model: extracts the target speech from the mixture.
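A minimal sketch of this mixing step (the random waveforms and STFT size below are illustrative assumptions; nperseg=512 is chosen so the STFT yields the 257 frequency bins the network expects):

import numpy as np
from scipy import signal

target = np.random.randn(16000).astype(np.float32)        # stand-in for a target-speaker waveform
interference = np.random.randn(16000).astype(np.float32)  # stand-in for an interfering waveform
_, _, Zt = signal.stft(target, nperseg=512)               # complex STFT, [257, frames]
_, _, Zi = signal.stft(interference, nperseg=512)
Z = Zt + Zi  # summing after the STFT; by linearity this equals the STFT of the summed waveforms
features = np.abs(Z).T[np.newaxis].astype(np.float32)     # [1, frames, 257] magnitude input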
The paper describes several SpeakerBeam variants; the best-performing one, shown in the figure, combines the scaled activations method with sequence summarization with attention. SpeakerBeam consists of two modules: the main network and the auxiliary network.
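Concretely (the symbols here are my own shorthand): the auxiliary network summarizes an adaptation utterance A of the target speaker into an embedding vector λ(A), and the first hidden layer of the main network is scaled elementwise by it, h1 = tanh(BLSTM(Y) ⊙ λ(A)), where Y is the mixture spectrogram. This is exactly what layer1 computes in the code below.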
Code walkthrough
Auxiliary network:
def AUX(x):  # auxiliary network
    conv1 = tf.nn.relu(tf.layers.conv1d(x, 200, 1, strides=1, padding='valid'))  # linear + ReLU
    conv2 = tf.layers.conv1d(conv1, 512, 1, strides=1, padding='valid')          # linear
    # softmax over the time-averaged activations gives per-channel attention weights
    attention = tf.reshape(tf.nn.softmax(tf.keras.layers.GlobalAvgPool1D()(conv2), -1), [-1, 1, 512])
    # attention-weighted time average: a [batch, 1, 512] speaker embedding
    results = tf.reshape(tf.keras.layers.GlobalAvgPool1D()(conv2 * attention), [-1, 1, 512])
    return results
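To trace the shapes: for an adaptation input of [batch, frames, 257], conv2 is [batch, frames, 512]; the first global average pool collapses time to [batch, 512]; the softmax turns that vector into per-channel attention weights; and a second time average of the reweighted activations gives the [batch, 1, 512] speaker embedding, which broadcasts over every frame of the mixture in the main network.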
BLSTM+linear:
def BLSTM_linear(x, name):
    # one variable scope per call, so that stacked BLSTM layers get their own weights
    with tf.variable_scope(name):
        # forward LSTM cell
        lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(512, forget_bias=1.0)
        # backward LSTM cell
        lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(512, forget_bias=1.0)
        outputs, output_states = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, x, dtype=tf.float32)
        fw_outputs, bw_outputs = outputs
        out_results = tf.concat([fw_outputs, bw_outputs], -1)  # [batch, frames, 1024]
        # 1x1 convolution acts as a time-distributed linear layer
        out_results = tf.layers.conv1d(out_results, 512, 1, strides=1, padding='valid')
    return out_results
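Note the name argument: SpeakerBeam below calls BLSTM_linear three times, and without a separate tf.variable_scope per call, graph construction fails in TF1 because the second call would try to recreate variables such as bidirectional_rnn/fw/basic_lstm_cell/kernel.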
SpeakerBeam:
def SpeakerBeam(x1, x2):  # main network
    # scale the first BLSTM layer's activations by the speaker embedding
    layer1 = tf.nn.tanh(BLSTM_linear(x1, 'blstm1') * AUX(x2))
    layer2 = tf.nn.tanh(BLSTM_linear(layer1, 'blstm2'))
    layer3 = tf.nn.tanh(BLSTM_linear(layer2, 'blstm3'))
    # sigmoid mask, one value per time-frequency bin (257, matching the input)
    layer4 = tf.nn.sigmoid(tf.layers.conv1d(layer3, 257, 1, strides=1, padding='valid'))
    return layer4
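layer4 is a sigmoid time-frequency mask over the 257 input bins; multiplying it with the mixture magnitude spectrogram gives the target-speaker estimate (see the sanity check after the full code).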
Full code
import tensorflow as tf
inputs1 = tf.placeholder(tf.float32, [None, None, 257])  # mixture spectrogram [batch, frames, 257]
inputs2 = tf.placeholder(tf.float32, [None, None, 257])  # adaptation-utterance spectrogram [batch, frames, 257]
def AUX(x):  # auxiliary network
    conv1 = tf.nn.relu(tf.layers.conv1d(x, 200, 1, strides=1, padding='valid'))  # linear + ReLU
    conv2 = tf.layers.conv1d(conv1, 512, 1, strides=1, padding='valid')          # linear
    # softmax over the time-averaged activations gives per-channel attention weights
    attention = tf.reshape(tf.nn.softmax(tf.keras.layers.GlobalAvgPool1D()(conv2), -1), [-1, 1, 512])
    # attention-weighted time average: a [batch, 1, 512] speaker embedding
    results = tf.reshape(tf.keras.layers.GlobalAvgPool1D()(conv2 * attention), [-1, 1, 512])
    return results
def BLSTM_linear(x, name):
    # one variable scope per call, so that stacked BLSTM layers get their own weights
    with tf.variable_scope(name):
        # forward LSTM cell
        lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(512, forget_bias=1.0)
        # backward LSTM cell
        lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(512, forget_bias=1.0)
        outputs, output_states = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, x, dtype=tf.float32)
        fw_outputs, bw_outputs = outputs
        out_results = tf.concat([fw_outputs, bw_outputs], -1)  # [batch, frames, 1024]
        # 1x1 convolution acts as a time-distributed linear layer
        out_results = tf.layers.conv1d(out_results, 512, 1, strides=1, padding='valid')
    return out_results
def SpeakerBeam(x1, x2):  # main network
    # scale the first BLSTM layer's activations by the speaker embedding
    layer1 = tf.nn.tanh(BLSTM_linear(x1, 'blstm1') * AUX(x2))
    layer2 = tf.nn.tanh(BLSTM_linear(layer1, 'blstm2'))
    layer3 = tf.nn.tanh(BLSTM_linear(layer2, 'blstm3'))
    # sigmoid mask, one value per time-frequency bin (257, matching the input)
    layer4 = tf.nn.sigmoid(tf.layers.conv1d(layer3, 257, 1, strides=1, padding='valid'))
    return layer4
mask = SpeakerBeam(inputs1, inputs2)
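A quick sanity check with random inputs (the shapes and data below are illustrative only); the masked magnitude spectrogram is the network's estimate of the target speaker:

import numpy as np

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    mix = np.random.randn(2, 100, 257).astype(np.float32)  # stand-in mixture features
    aux = np.random.randn(2, 80, 257).astype(np.float32)   # stand-in adaptation features
    m = sess.run(mask, {inputs1: mix, inputs2: aux})
    print(m.shape)       # (2, 100, 257)
    estimate = m * mix   # masked magnitude spectrogram of the target speaker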