This post walks through weight normalization as applied to text convolutions (conv1d) and fully connected (dense) layers, following the implementation in simple-effective-text-matching. Weight normalization (Salimans & Kingma, 2016) reparameterizes each weight vector as w = g * v / ||v||, decoupling the vector's direction v / ||v|| from its learned magnitude g.
import numpy as np
import tensorflow as tf
def gelu(x):
    # Tanh approximation of GELU: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
    # where 0.7978845608 ≈ sqrt(2/pi).
    return 0.5 * x * (1 + tf.nn.tanh(x * 0.7978845608 * (1 + 0.044715 * x * x)))
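As a sanity check, here is a minimal sketch (assuming TF 1.x; the name gelu_exact is mine, not from the original code) comparing the tanh approximation above against the exact GELU computed from the Gaussian CDF:

def gelu_exact(x):
    # Exact GELU: x * Phi(x), with Phi the standard normal CDF via tf.erf.
    return 0.5 * x * (1.0 + tf.erf(x / np.sqrt(2.0)))

xs = tf.constant(np.linspace(-3.0, 3.0, 61), dtype=tf.float32)
max_err = tf.reduce_max(tf.abs(gelu(xs) - gelu_exact(xs)))
with tf.Session() as sess:
    print(sess.run(max_err))  # small; roughly on the order of 1e-3 or below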
def get_weight(shape, gain=np.sqrt(2), weight_norm=True, fan_in=None, name="weight"):
    if fan_in is None:
        fan_in = np.prod(shape[:-1])
    std = gain / np.sqrt(fan_in)  # He initialization
    w = tf.get_variable(name, shape=shape, initializer=tf.initializers.random_normal(0, std),
                        dtype=tf.float32)
    if weight_norm:
        # g is initialized to ones, so right after initialization the multiplication by g
        # below is a no-op and the result is just the unit-norm direction (return the ww
        # tensor to verify). During training g learns each output column's magnitude.
        g = tf.get_variable("{}_g".format(name), shape=(1, ) * (len(shape) - 1) + (shape[-1], ),
                            initializer=tf.ones_initializer)
        # L2 norm of each output column, reduced over all input dimensions.
        w_norm = tf.sqrt(tf.reduce_sum(tf.square(w), axis=list(range(len(shape) - 1)), keepdims=True))
        # ww = w / tf.maximum(w_norm, 1e-7)
        w = w / tf.maximum(w_norm, 1e-7) * g
    return w
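Since g starts at ones, the returned w should have unit-norm output columns right after initialization. A minimal verification sketch (assuming TF 1.x; the scope name "check" and the [3, 4] shape are just illustrative):

with tf.Graph().as_default():
    with tf.variable_scope("check"):
        w = get_weight([3, 4], weight_norm=True)
    # Norm of each of the 4 output columns, reduced over the input dimension.
    col_norms = tf.sqrt(tf.reduce_sum(tf.square(w), axis=0))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(col_norms))  # approximately [1. 1. 1. 1.]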
def apply_bias(x, name='bias'):
    b = tf.get_variable(name, shape=[x.get_shape()[-1]], initializer=tf.zeros_initializer)
    b = tf.cast(b, x.dtype)
    # Reshape to [1, ..., 1, channels] so the bias broadcasts over all leading dimensions.
    b = tf.reshape(b, [1] * len(x.get_shape()[:-1]) + [x.get_shape().as_list()[-1]])
    return x + b
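A tiny sketch (assuming TF 1.x; the scope name "bias_demo" is illustrative) of how apply_bias broadcasts, the bias being added to every position of every batch element:

with tf.Graph().as_default(), tf.variable_scope("bias_demo"):
    x = tf.zeros([2, 3, 4])
    y = apply_bias(x)  # bias reshaped to [1, 1, 4], broadcast over the first two dims
    print(y.shape)  # (2, 3, 4)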
def dense(x, units, activation=None, name="dense"):
    """
    :param x: float tensor of shape [..., origin_units]
    :param units: int, number of output units
    :param activation: a TensorFlow activation function, e.g. tf.nn.relu
    :param name: str, name of the variable scope for this layer
    :return: float tensor of shape [..., units]
    """
    with tf.variable_scope(name):
        fan_in = x.shape[-1].value
        # Output shape: the same leading dims as x, with the last dim replaced by units.
        new_shape = tf.concat([tf.shape(x)[:-1], tf.constant([units])], axis=0)
        # Flatten all leading dims so the matmul is a plain 2-D product.
        x = tf.reshape(x, (-1, fan_in))
        gain = np.sqrt(2) if activation is tf.nn.relu else 1.0
        w = get_weight([fan_in, units], gain=gain)
        out = apply_bias(tf.matmul(x, w))
        out = tf.reshape(out, new_shape)
        if activation:
            # Following simple-effective-text-matching, relu is silently replaced by gelu
            # (the He gain above is still computed as if relu were used).
            if activation is tf.nn.relu:
                activation = gelu
            out = activation(out)
        return out
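A minimal usage sketch for dense (assuming TF 1.x; the shapes and the scope name "proj" are illustrative). The layer accepts any number of leading dimensions and only changes the last one:

with tf.Graph().as_default():
    x = tf.placeholder(tf.float32, [None, None, 128])  # [batch, seq, emb]
    h = dense(x, units=64, activation=tf.nn.relu, name="proj")  # relu is swapped for gelu internally
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        out = sess.run(h, {x: np.random.randn(2, 5, 128).astype(np.float32)})
        print(out.shape)  # (2, 5, 64)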
def conv1d(x, filters, kernel_size, activation=None, name='conv1d'):
    """
    :param x: float tensor of shape [batch, seq_x, embedding_size], i.e. the embedded text
    :param filters: int, number of convolution filters (output channels)
    :param kernel_size: int, kernel size along the sequence axis (this is a 1-D convolution)
    :param activation: a built-in TensorFlow activation function
    :param name: str, name of the variable scope for this convolution
    :return: float tensor of shape [batch, seq_x, filters]
    """
    with tf.variable_scope(name):
        gain = np.sqrt(2) if activation is tf.nn.relu else 1
        # Implement the 1-D convolution as a 2-D convolution with height 1:
        # x becomes [batch, 1, seq_x, embedding_size] and the kernel
        # w becomes [1, kernel_size, embedding_size, filters].
        x = tf.expand_dims(x, 1)
        w = get_weight([kernel_size, x.shape[-1].value, filters], gain=gain)
        w = tf.expand_dims(w, 0)
        out = tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')
        # Drop the dummy height dimension: back to [batch, seq_x, filters].
        out = tf.squeeze(out, [1])
        out = apply_bias(out)
        if activation:
            out = activation(out)
        return out
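Finally, a minimal end-to-end sketch of conv1d (assuming TF 1.x; the shapes and the scope name "enc" are illustrative). With SAME padding the sequence length is preserved and only the channel dimension changes:

with tf.Graph().as_default():
    x = tf.placeholder(tf.float32, [None, None, 300])  # [batch, seq_x, embedding_size]
    y = conv1d(x, filters=150, kernel_size=3, activation=tf.nn.relu, name="enc")
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        out = sess.run(y, {x: np.random.randn(4, 20, 300).astype(np.float32)})
        print(out.shape)  # (4, 20, 150)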