# -*- coding: utf-8 -*-
import os
import tensorflow as tf
class CBOWNetwork(object):
    def __init__(self, name="W2V", num_sampled=100, window=4, vocab_size=3365, embedding_size=128, is_mean=True,
                 regularization=0.001, optimizer_name='adam', learning_rate=0.01, checkpoint_dir="./running/model"):
        self.name = name  # Network name
        self.vocab_size = vocab_size  # Vocabulary size
        self.embedding_size = embedding_size  # Dimensionality of the word embeddings
        self.is_mean = is_mean  # Whether to merge the context vectors by averaging (otherwise by summing)
        self.window = window  # Window size, i.e. the number of context words (excluding the center word)
        self.num_sampled = num_sampled  # Number of classes (words) drawn as negative samples
        self.regularization = regularization  # Regularization coefficient
        self.optimizer_name = optimizer_name.lower()  # Optimizer name
        self.learning_rate = learning_rate  # Learning rate
        self.adam_beta1 = 0.9  # Adam optimizer parameter
        self.adam_beta2 = 0.999  # Adam optimizer parameter
        self.epsilon = 1e-8  # Adam/Adadelta optimizer parameter
        self.adadelta_rho = 0.95  # Adadelta optimizer parameter
        self.checkpoint_dir = checkpoint_dir  # Directory for model checkpoints
        self.checkpoint_path = os.path.join(self.checkpoint_dir, "{}.ckpt".format(self.name.lower()))
        # Make sure the checkpoint directory exists
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        self.input_x = None  # [B,T]
        self.target = None  # [B,1]
        self.training = None  # []
        self.global_step = None  # []
        self.features = None  # [B,E]
        self.embedding_table = None  # [V,E]
        self.saver = None  # Saver object for restoring/persisting the model parameters
    def interface(self):
        """
        Build the forward (inference) network.
        :return:
        """
        with tf.variable_scope(self.name):
            # 1. Define the inputs
            with tf.variable_scope("placeholder"):
                self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, self.window], name="input_x")  # [B,T]
                self.target = tf.placeholder(dtype=tf.int32, shape=[None, 1], name="target")  # [B,1]
                self.training = tf.placeholder_with_default(True, shape=[], name="training")
                self.global_step = tf.train.get_or_create_global_step()
            # 2. Embedding lookup: convert word ids into word vectors
            with tf.variable_scope("embedding"), tf.device("/cpu:0"):
                # a. Define the embedding lookup table
                self.embedding_table = tf.get_variable("embedding_table",
                                                       shape=[self.vocab_size, self.embedding_size],
                                                       dtype=tf.float32)
                # b. Convert the word ids into word vectors, [B,T] --> [B,T,E]
                vectors = tf.nn.embedding_lookup(params=self.embedding_table, ids=self.input_x)
            # 3. Merge the context vectors to obtain the final feature representation
            with tf.variable_scope("merge"):
                if self.is_mean:
                    # Merge the T word vectors by taking their mean, [B,T,E] --> [B,E]
                    features = tf.reduce_mean(vectors, axis=1)
                else:
                    # Merge the T word vectors by summing them, [B,T,E] --> [B,E]
                    features = tf.reduce_sum(vectors, axis=1)
                # Expose the result as an attribute
                self.features = tf.identity(features, "features")
    def losses(self):
        """
        Build the loss function.
        :return:
        """
        with tf.variable_scope("loss"):
            # 0. Define the output-projection parameters
            weight = tf.get_variable(name="weight", shape=[self.vocab_size, self.embedding_size])
            bias = tf.get_variable(name="bias", shape=[self.vocab_size])

            def train_loss():
                """
                Build the training-time loss.
                :return:
                """
                _loss = tf.nn.sampled_softmax_loss(
                    weights=weight,  # Output projection weights w, shape: [V,E]
                    biases=bias,  # Output projection bias b, shape: [V,]
                    labels=self.target,  # True class indices, shape: [B,num_true]; num_true is the number of target labels per example
                    inputs=self.features,  # Features extracted by the forward pass, shape: [B,E]
                    num_sampled=self.num_sampled,  # Number of classes (negatives) randomly sampled per batch
                    num_classes=self.vocab_size,  # Total number of classes, i.e. the vocabulary size
                    num_true=1  # Number of true target labels per example
                )
                _loss = tf.reduce_mean(_loss, name="train_loss")
                return _loss

            def eval_loss():
                """
                Build the evaluation-time loss (no sampled approximation is allowed here, so a full projection is required).
                :return:
                """
                # 1. Fully-connected projection to obtain the logits: [B,V]
                logits = tf.nn.bias_add(tf.matmul(self.features, weight, transpose_b=True), bias=bias)
                # 2. Reshape the targets
                labels = tf.reshape(self.target, shape=[-1])
                # 3. Build the loss
                _loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels,  # [N1,N2,...,Nn], e.g. [B,]; the true class ids
                    logits=logits  # [N1,N2,...,Nn,num_class], e.g. [B,V]; per-class confidences
                )
                _loss = tf.reduce_mean(_loss, name="eval_loss")
                return _loss

            # 1. Pick the loss according to the training/evaluation flag
            loss = tf.cond(
                pred=self.training,  # The condition, a Tensor
                true_fn=train_loss,  # Returns the result of true_fn when pred is True
                false_fn=eval_loss  # Returns the result of false_fn when pred is False
            )
            tf.summary.scalar('loss', loss)
            # 2. Add the L2 loss of the variables to the total loss
            l2_loss = tf.nn.l2_loss(self.embedding_table) + tf.nn.l2_loss(weight) + tf.nn.l2_loss(bias)
            l2_loss = self.regularization * l2_loss
            tf.summary.scalar('l2_loss', l2_loss)
            # 3. Combine all losses
            total_loss = loss + l2_loss
            tf.summary.scalar('total_loss', total_loss)
            return total_loss
    def optimizer(self, loss):
        """
        Build the optimizer and the training operation.
        :param loss:
        :return:
        """
        with tf.variable_scope("train"):
            # 1. Build the optimizer
            if self.optimizer_name == 'adam':
                opt = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate,
                    beta1=self.adam_beta1,
                    beta2=self.adam_beta2,
                    epsilon=self.epsilon
                )
            elif self.optimizer_name == 'adadelta':
                opt = tf.train.AdadeltaOptimizer(
                    learning_rate=self.learning_rate,
                    rho=self.adadelta_rho,
                    epsilon=self.epsilon
                )
            elif self.optimizer_name == 'adagrad':
                opt = tf.train.AdagradOptimizer(learning_rate=self.learning_rate)
            else:
                opt = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
            # 2. Build the training operation
            train_op = opt.minimize(loss=loss, global_step=self.global_step)
            return opt, train_op
    def metrics(self, loss=None):
        """
        Build the model evaluation metrics.
        :param loss:
        :return:
        """
        pass
    def restore(self, session):
        """
        Restore the model parameters.
        :param session:
        :return:
        """
        # 0. Create the saver if needed
        if self.saver is None:
            self.saver = tf.train.Saver()
        # 1. Initialize all variables
        session.run(tf.global_variables_initializer())
        # 2. Restore the model from the checkpoint_dir directory, if a checkpoint exists
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=self.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            tf.logging.info("Restoring model weights from '{}'".format(ckpt.model_checkpoint_path))
            self.saver.restore(session, save_path=ckpt.model_checkpoint_path)
            self.saver.recover_last_checkpoints(ckpt.all_model_checkpoint_paths)
    def save(self, session):
        """
        Persist the model parameters.
        :param session:
        :return:
        """
        # 0. Create the saver if needed
        if self.saver is None:
            self.saver = tf.train.Saver()
        # 1. Save
        tf.logging.info("Storing model weights to '{}'".format(self.checkpoint_path))
        self.saver.save(session, save_path=self.checkpoint_path, global_step=self.global_step)
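

# A minimal smoke-test sketch for CBOWNetwork, showing how interface/losses/
# optimizer/restore/save are meant to be wired together. This is an
# illustrative addition: the random ids stand in for a real corpus, and the
# vocab size, batch size, step count and checkpoint directory are hypothetical.
def _cbow_smoke_test():
    import numpy as np
    tf.reset_default_graph()
    net = CBOWNetwork(vocab_size=1000, window=4, num_sampled=10,
                      checkpoint_dir="./running/cbow_demo")
    net.interface()  # build the forward graph
    total_loss = net.losses()  # build the train/eval loss
    _, train_op = net.optimizer(total_loss)  # build the training op
    with tf.Session() as sess:
        net.restore(sess)  # initializes variables, then restores a checkpoint if one exists
        context = np.random.randint(0, 1000, size=(8, 4))  # [B,T] context word ids
        center = np.random.randint(0, 1000, size=(8, 1))   # [B,1] center word ids
        for step in range(5):
            loss_value, _ = sess.run([total_loss, train_op],
                                     feed_dict={net.input_x: context, net.target: center})
            print("step {}: loss {:.4f}".format(step, loss_value))
        net.save(sess)

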
class SkipGramNetwork(object):
    def __init__(self, name="W2V", num_sampled=100, window=4, vocab_size=3365, embedding_size=128,
                 regularization=0.001, optimizer_name='adam', learning_rate=0.01, checkpoint_dir="./running/model"):
        self.name = name  # Network name
        self.vocab_size = vocab_size  # Vocabulary size
        self.embedding_size = embedding_size  # Dimensionality of the word embeddings
        self.window = window  # Window size, i.e. the number of context words (excluding the center word)
        self.num_sampled = num_sampled  # Number of classes (words) drawn as negative samples
        self.regularization = regularization  # Regularization coefficient
        self.optimizer_name = optimizer_name.lower()  # Optimizer name
        self.learning_rate = learning_rate  # Learning rate
        self.adam_beta1 = 0.9  # Adam optimizer parameter
        self.adam_beta2 = 0.999  # Adam optimizer parameter
        self.epsilon = 1e-8  # Adam/Adadelta optimizer parameter
        self.adadelta_rho = 0.95  # Adadelta optimizer parameter
        self.checkpoint_dir = checkpoint_dir  # Directory for model checkpoints
        self.checkpoint_path = os.path.join(self.checkpoint_dir, "{}.ckpt".format(self.name.lower()))
        # Make sure the checkpoint directory exists
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        self.input_x = None  # [B,1]
        self.target = None  # [B,T]
        self.training = None  # []
        self.global_step = None  # []
        self.features = None  # [B,E]
        self.embedding_table = None  # [V,E]
        self.saver = None  # Saver object for restoring/persisting the model parameters
    def interface(self):
        """
        Build the forward (inference) network.
        :return:
        """
        with tf.variable_scope(self.name):
            # 1. Define the inputs
            with tf.variable_scope("placeholder"):
                self.input_x = tf.placeholder(dtype=tf.int32, shape=[None, 1], name="input_x")  # [B,1]
                self.target = tf.placeholder(dtype=tf.int32, shape=[None, self.window], name="target")  # [B,T]
                self.training = tf.placeholder_with_default(True, shape=[], name="training")
                self.global_step = tf.train.get_or_create_global_step()
            # 2. Embedding lookup: convert word ids into word vectors
            with tf.variable_scope("embedding"), tf.device("/cpu:0"):
                # a. Define the embedding lookup table
                self.embedding_table = tf.get_variable("embedding_table",
                                                       shape=[self.vocab_size, self.embedding_size],
                                                       dtype=tf.float32)
                # b. Convert the word ids into word vectors, [B,1] --> [B,1,E]
                vectors = tf.nn.embedding_lookup(params=self.embedding_table, ids=self.input_x)
            # 3. Squeeze the lookup result to obtain the final feature representation
            with tf.variable_scope("merge"):
                # Squeeze out the singleton dimension, [B,1,E] --> [B,E]
                features = tf.squeeze(vectors, axis=1)
                # Expose the result as an attribute
                self.features = tf.identity(features, "features")
    def losses(self):
        """
        Build the loss function.
        :return:
        """
        with tf.variable_scope("loss"):
            # 0. Define the output-projection parameters
            weight = tf.get_variable(name="weight", shape=[self.vocab_size, self.embedding_size])
            bias = tf.get_variable(name="bias", shape=[self.vocab_size])

            def train_loss():
                """
                Build the training-time loss.
                :return:
                """
                _loss = tf.nn.nce_loss(
                    weights=weight,  # Output projection weights w, shape: [V,E]
                    biases=bias,  # Output projection bias b, shape: [V,]
                    labels=self.target,  # True class indices, shape: [B,num_true]; num_true is the number of target labels per example
                    inputs=self.features,  # Features extracted by the forward pass, shape: [B,E]
                    num_sampled=self.num_sampled,  # Number of classes (negatives) randomly sampled per batch
                    num_classes=self.vocab_size,  # Total number of classes, i.e. the vocabulary size
                    num_true=self.window  # Number of target labels per example
                )
                _loss = tf.reduce_mean(_loss, name="train_loss")
                return _loss

            def eval_loss():
                """
                Build the evaluation-time loss (no sampled approximation is allowed here, so a full projection is required).
                :return:
                """
                # 1. Fully-connected projection to obtain the logits: [B,V]
                logits = tf.nn.bias_add(tf.matmul(self.features, weight, transpose_b=True), bias=bias)
                # 2. One-hot encode the targets (several positions per row can be 1)
                labels = tf.one_hot(self.target, depth=self.vocab_size)  # [B,T] --> [B,T,V]
                labels = tf.reduce_sum(labels, axis=1)  # [B,T,V] --> [B,V]
                # 3. Build the loss (each input maps to several target labels, so sigmoid cross-entropy is used)
                # TODO: Think about why sigmoid cross-entropy is used here rather than a softmax loss.
                # Softmax uses the confidences of all classes when computing the probability of any one class; the
                # classes are mutually exclusive, so training with softmax can only push up the confidence of a
                # single class while pushing down all the others.
                # Sigmoid computes each class probability independently, so the classes do not influence each other,
                # which allows one example to belong to several classes at once.
                _loss = tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=labels,  # Must have the same shape as logits, generally [B,V]
                    logits=logits
                )
                _loss = tf.reduce_mean(_loss, name="eval_loss")
                return _loss

            # 1. Pick the loss according to the training/evaluation flag
            loss = tf.cond(
                pred=self.training,  # The condition, a Tensor
                true_fn=train_loss,  # Returns the result of true_fn when pred is True
                false_fn=eval_loss  # Returns the result of false_fn when pred is False
            )
            tf.summary.scalar('loss', loss)
            # 2. Add the L2 loss of the variables to the total loss
            l2_loss = tf.nn.l2_loss(self.embedding_table) + tf.nn.l2_loss(weight) + tf.nn.l2_loss(bias)
            l2_loss = self.regularization * l2_loss
            tf.summary.scalar('l2_loss', l2_loss)
            # 3. Combine all losses
            total_loss = loss + l2_loss
            tf.summary.scalar('total_loss', total_loss)
            return total_loss
    def optimizer(self, loss):
        """
        Build the optimizer and the training operation.
        :param loss:
        :return:
        """
        with tf.variable_scope("train"):
            # 1. Build the optimizer
            if self.optimizer_name == 'adam':
                opt = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate,
                    beta1=self.adam_beta1,
                    beta2=self.adam_beta2,
                    epsilon=self.epsilon
                )
            elif self.optimizer_name == 'adadelta':
                opt = tf.train.AdadeltaOptimizer(
                    learning_rate=self.learning_rate,
                    rho=self.adadelta_rho,
                    epsilon=self.epsilon
                )
            elif self.optimizer_name == 'adagrad':
                opt = tf.train.AdagradOptimizer(learning_rate=self.learning_rate)
            else:
                opt = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
            # 2. Build the training operation
            train_op = opt.minimize(loss=loss, global_step=self.global_step)
            return opt, train_op
    def metrics(self, loss=None):
        """
        Build the model evaluation metrics.
        :param loss:
        :return:
        """
        pass
    def restore(self, session):
        """
        Restore the model parameters.
        :param session:
        :return:
        """
        # 0. Create the saver if needed
        if self.saver is None:
            self.saver = tf.train.Saver()
        # 1. Initialize all variables
        session.run(tf.global_variables_initializer())
        # 2. Restore the model from the checkpoint_dir directory, if a checkpoint exists
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=self.checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:
            tf.logging.info("Restoring model weights from '{}'".format(ckpt.model_checkpoint_path))
            self.saver.restore(session, save_path=ckpt.model_checkpoint_path)
            self.saver.recover_last_checkpoints(ckpt.all_model_checkpoint_paths)
    def save(self, session):
        """
        Persist the model parameters.
        :param session:
        :return:
        """
        # 0. Create the saver if needed
        if self.saver is None:
            self.saver = tf.train.Saver()
        # 1. Save
        tf.logging.info("Storing model weights to '{}'".format(self.checkpoint_path))
        self.saver.save(session, save_path=self.checkpoint_path, global_step=self.global_step)
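

# A minimal skip-gram training sketch mirroring the CBOW smoke test above.
# This is an illustrative addition: the random ids stand in for a real corpus,
# and the sizes and checkpoint directory are hypothetical.
if __name__ == "__main__":
    import numpy as np
    tf.logging.set_verbosity(tf.logging.INFO)  # make the restore/save log lines visible
    _cbow_smoke_test()
    tf.reset_default_graph()
    net = SkipGramNetwork(vocab_size=1000, window=4, num_sampled=10,
                          checkpoint_dir="./running/skipgram_demo")
    net.interface()
    total_loss = net.losses()
    _, train_op = net.optimizer(total_loss)
    with tf.Session() as sess:
        net.restore(sess)
        center = np.random.randint(0, 1000, size=(8, 1))    # [B,1] center word ids
        context = np.random.randint(0, 1000, size=(8, 4))   # [B,T] context word ids
        for step in range(5):
            loss_value, _ = sess.run([total_loss, train_op],
                                     feed_dict={net.input_x: center, net.target: context})
            print("step {}: loss {:.4f}".format(step, loss_value))
        net.save(sess)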