原创: Lebhoryi@rt-thread.com
时间: 2020/06/10
coding: google-research/kws_streaming/models/cnn.py
-
step1. 用 argparse 设定超参,包括卷积核等:
# sub parser for model settings
subparsers = parser.add_subparsers(dest='model_name', help='NN model name')
# CNN model settings
parser_cnn = subparsers.add_parser('cnn')
cnn.model_parameters(parser_cnn)
-
step2. build cnn
model = cnn.model(flags)
# 用 zip 来解析超参,这一步简直是神来之笔,省了好多调用
for filters, kernel_size, activation, dilation_rate, strides in zip(
    parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
    parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
    parse(flags.cnn_strides)):
    ...
parse 函数:
def parse(text):
    """Convert a textual hyperparameter into a sequence of Python values.

    The text is evaluated with `ast.literal_eval`, so only Python literals
    (numbers, strings, tuples, ...) are accepted; arbitrary expressions are
    rejected.

    Note that a text holding a single tuple, e.g. "(3,3)", evaluates to that
    tuple and is returned as-is, so iterating the result yields 3 and 3
    rather than one (3, 3) item.

    Args:
      text: string such as "64,64,128" or "(3,3),(5,3)"; may be empty or
        None.

    Returns:
      An empty list when `text` is falsy; the evaluated tuple when the text
      evaluates to a tuple; otherwise a one-element list holding the single
      evaluated value.

    Raises:
      ValueError or SyntaxError: propagated from `ast.literal_eval` when the
        text is not a valid Python literal.
    """
    # Imported locally so the function works even where the module-level
    # import of `ast` is not present.
    import ast

    if not text:
        return []

    # literal_eval turns the string back into the Python value it spells out.
    value = ast.literal_eval(text)
    return value if isinstance(value, tuple) else [value]
接下来就是搭建网络了:用 for 循环搭建 7 个卷积层(忽略掉 Stream() 的话,就是普通的卷积函数),
然后跟上一些其他的层:
- Flatten()
- Dropout()
- 两个 FC,也是用 for 循环搭建
- 最后一层输出层
-
step3. 查看网络结构
logging.info(model.summary())
# coding=utf-8
# Copyright 2020 The Google Research Authors.
# cnn.py
"""CNN model with Mel spectrum."""
from kws_streaming.layers import modes
from kws_streaming.layers import speech_features
from kws_streaming.layers.compat import tf
from kws_streaming.layers.stream import Stream
from kws_streaming.models.utils import parse
def model_parameters(parser_nn):
    """Convolutional Neural Network (CNN) model parameters.

    Registers the CNN hyperparameters as command-line options. The per-layer
    options are kept as strings (e.g. "64,64" or "(3,3),(5,3)"); `model`
    later converts them with `parse` and consumes them layer by layer.

    Args:
      parser_nn: argparse parser (or sub-parser) that receives the options.
    """
    # One row per option: (flag, type, default, help). The order is the order
    # in which the options are registered and shown in the help output.
    options = (
        ('--cnn_filters', str, '64,64,64,64,128,64,128',
         'Number of output filters in the convolution layers'),
        ('--cnn_kernel_size', str,
         '(3,3),(5,3),(5,3),(5,3),(5,2),(5,1),(10,1)',
         'Heights and widths of the 2D convolution window'),
        ('--cnn_act', str,
         "'relu','relu','relu','relu','relu','relu','relu'",
         'Activation function in the convolution layers'),
        ('--cnn_dilation_rate', str,
         '(1,1),(1,1),(2,1),(1,1),(2,1),(1,1),(2,1)',
         'Dilation rate to use for dilated convolutions'),
        ('--cnn_strides', str,
         '(1,1),(1,1),(1,1),(1,1),(1,1),(1,1),(1,1)',
         'Strides of the convolution layers along the height and width'),
        # The value is used directly as the Dropout layer's `rate`, so it is
        # a fraction rather than a percentage.
        ('--dropout1', float, 0.5,
         'Fraction of the input units to drop (dropout rate, between 0 and 1)'),
        ('--units2', str, '128,256',
         'Number of units in the last set of hidden layers'),
        ('--act2', str, "'linear','relu'",
         'Activation function of the last set of hidden layers'),
    )
    for flag, value_type, default, help_text in options:
        parser_nn.add_argument(
            flag,
            type=value_type,
            default=default,
            help=help_text,
        )
def model(flags):
    """Build the CNN keyword-spotting model described by `flags`.

    The architecture is based on the paper:
    Convolutional Neural Networks for Small-footprint Keyword Spotting
    http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

    Layer order: optional speech feature extraction (only when
    `flags.preprocess == 'raw'`), a stack of Conv2D layers each wrapped in
    `Stream`, a `Stream`-wrapped Flatten, Dropout, a stack of Dense layers
    and a final Dense layer with `flags.label_count` units and no activation.

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    # NOTE(review): the input shape is requested for
    # STREAM_INTERNAL_STATE_INFERENCE although the model is documented as a
    # training model - confirm this is intended.
    input_shape = modes.get_input_data_shape(
        flags, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
    input_audio = tf.keras.layers.Input(
        shape=input_shape, batch_size=flags.batch_size)

    net = input_audio
    if flags.preprocess == 'raw':
        # Self-contained model: the user feeds raw audio only and the
        # features are computed inside the network.
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    # Append a trailing axis before the 2D convolutions.
    net = tf.keras.backend.expand_dims(net)

    # zip stops at the shortest list, so the number of convolution layers is
    # the length of the shortest per-layer setting.
    conv_settings = zip(
        parse(flags.cnn_filters),
        parse(flags.cnn_kernel_size),
        parse(flags.cnn_act),
        parse(flags.cnn_dilation_rate),
        parse(flags.cnn_strides),
    )
    for filters, kernel_size, activation, dilation_rate, strides in conv_settings:
        conv = tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides)
        net = Stream(cell=conv)(net)

    flatten = tf.keras.layers.Flatten()
    net = Stream(cell=flatten)(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    # Hidden fully-connected layers, one per (units, activation) pair.
    dense_settings = zip(parse(flags.units2), parse(flags.act2))
    for units, activation in dense_settings:
        dense = tf.keras.layers.Dense(units=units, activation=activation)
        net = dense(net)

    # Output layer: one unit per label, no activation.
    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, net)
总结:
用(for + zip)的方式可以省去好多重复代码,网络结构搭建得很赏心悦目,再配上用 argparse 设定网络参数,是我见过的最优雅的 CNN 代码构建,没有之一。
# 附上之前的CNN 构建源码 `2 Conv2d + linear + 2 FC`
def create_cnn_model2(fingerprint_input, model_settings, model_size_info,
                      is_training):
    """Build a network of 2 convolutions, a linear layer and 2 FC layers.

    Each convolution is a 'VALID' conv2d plus bias, followed by batch
    normalization, ReLU and (only when training) dropout. The output of the
    second convolution is flattened and passed through a linear layer (no
    activation), a fully-connected layer with batch normalization, ReLU and
    (only when training) dropout, and a final fully-connected layer producing
    `label_count` outputs.

    Args:
      fingerprint_input: input tensor; reshaped here to
        [-1, spectrogram_length, dct_coefficient_count, 1].
      model_settings: mapping that provides 'dct_coefficient_count',
        'spectrogram_length' and 'label_count'.
      model_size_info: indexable with at least 12 entries. Entries 0-4 and
        5-9 describe the first and second convolution as
        {number of conv features, filter height (time axis), filter width
        (frequency axis), stride in y (time axis), stride in x (frequency
        axis)}; entry 10 is the linear layer size and entry 11 the
        fully-connected layer size.
      is_training: when truthy, dropout is applied and a 'dropout_prob'
        placeholder is created and returned.

    Returns:
      (final_fc, dropout_prob) when `is_training` is truthy, otherwise only
      final_fc, the output of the last fully-connected layer.
    """
    # NOTE(review): this function uses `math`; the import is not visible in
    # this excerpt - confirm it exists at module level.
    if is_training:
        # NOTE(review): the placeholder is passed as the second positional
        # argument of tf.nn.dropout, which in TF 1.x is keep_prob -
        # presumably it must be fed the keep probability; confirm against
        # the training loop.
        dropout_prob = tf.placeholder(tf.float32, name='dropout_prob')

    freq_bins = model_settings['dct_coefficient_count']
    time_steps = model_settings['spectrogram_length']
    net_input = tf.reshape(fingerprint_input,
                           [-1, time_steps, freq_bins, 1])

    # Heights/stride_y run along the time axis, widths/stride_x along the
    # frequency axis.
    (conv1_filters, conv1_height, conv1_width, conv1_stride_y,
     conv1_stride_x,
     conv2_filters, conv2_height, conv2_width, conv2_stride_y,
     conv2_stride_x,
     linear_units, fc_units) = [model_size_info[i] for i in range(12)]

    # ---- first convolution ----
    conv1_kernel_init = tf.truncated_normal(
        [conv1_height, conv1_width, 1, conv1_filters], stddev=0.01)
    conv1_kernel = tf.Variable(conv1_kernel_init)
    conv1_bias = tf.Variable(tf.zeros([conv1_filters]))
    conv1_strides = [1, conv1_stride_y, conv1_stride_x, 1]
    conv1 = tf.nn.conv2d(
        net_input, conv1_kernel, conv1_strides, 'VALID') + conv1_bias
    conv1 = tf.layers.batch_normalization(
        conv1, training=is_training, name='bn1')
    conv1_act = tf.nn.relu(conv1)
    conv1_out = (
        tf.nn.dropout(conv1_act, dropout_prob) if is_training else conv1_act)

    # Spatial size after a 'VALID' convolution.
    conv1_out_width = math.ceil(
        (freq_bins - conv1_width + 1) / conv1_stride_x)
    conv1_out_height = math.ceil(
        (time_steps - conv1_height + 1) / conv1_stride_y)

    # ---- second convolution ----
    conv2_kernel_init = tf.truncated_normal(
        [conv2_height, conv2_width, conv1_filters, conv2_filters],
        stddev=0.01)
    conv2_kernel = tf.Variable(conv2_kernel_init)
    conv2_bias = tf.Variable(tf.zeros([conv2_filters]))
    conv2_strides = [1, conv2_stride_y, conv2_stride_x, 1]
    conv2 = tf.nn.conv2d(
        conv1_out, conv2_kernel, conv2_strides, 'VALID') + conv2_bias
    # Batch normalization on the second convolution (added 2020/05/21).
    conv2 = tf.layers.batch_normalization(
        conv2, training=is_training, name='bn2')
    conv2_act = tf.nn.relu(conv2)
    conv2_out = (
        tf.nn.dropout(conv2_act, dropout_prob) if is_training else conv2_act)

    conv2_out_width = math.ceil(
        (conv1_out_width - conv2_width + 1) / conv2_stride_x)
    conv2_out_height = math.ceil(
        (conv1_out_height - conv2_height + 1) / conv2_stride_y)

    # Flatten everything except the leading dimension.
    flat_size = int(conv2_out_width * conv2_out_height * conv2_filters)
    flat = tf.reshape(conv2_out, [-1, flat_size])

    # ---- linear layer (no activation) ----
    xavier = tf.contrib.layers.xavier_initializer()
    linear_weights = tf.get_variable(
        'W', shape=[flat_size, linear_units], initializer=xavier)
    linear_bias = tf.get_variable('b', shape=[linear_units])
    linear_out = tf.matmul(flat, linear_weights) + linear_bias

    # ---- first fully-connected layer ----
    fc1_weights = tf.Variable(
        tf.truncated_normal([linear_units, fc_units], stddev=0.01))
    fc1_bias = tf.Variable(tf.zeros([fc_units]))
    fc1 = tf.matmul(linear_out, fc1_weights) + fc1_bias
    fc1 = tf.layers.batch_normalization(
        fc1, training=is_training, name='bn3')
    fc1 = tf.nn.relu(fc1)
    fc1_out = tf.nn.dropout(fc1, dropout_prob) if is_training else fc1

    # ---- final fully-connected layer ----
    label_count = model_settings['label_count']
    out_weights = tf.Variable(
        tf.truncated_normal([fc_units, label_count], stddev=0.01))
    out_bias = tf.Variable(tf.zeros([label_count]))
    final_fc = tf.matmul(fc1_out, out_weights) + out_bias

    if not is_training:
        return final_fc
    return final_fc, dropout_prob