# -*- coding: utf-8 -*-
"""
@Time    : 19-10-17 4:06 PM
@Author  : lei
@Site    :
@File    : Cheng.py
@Software: PyCharm
"""
# An AI that writes its own chengyu (four-character Chinese idioms).
import tensorflow as tf
import collections
import numpy as np
# Number of idioms: 47247
# Number of distinct characters: 4908
# Class that loads and preprocesses the idiom data.
class Input(object):
    def __init__(self, file_path):
        self.file_path = file_path

    # Load the raw data from disk.
    def extract_data(self):
        with tf.io.gfile.GFile(self.file_path, "r") as f:
            data = f.read()
        # Split the text into one idiom per entry.
        list_data = data.strip().split()
        # Keep only four-character idioms (the model assumes length 4).
        list_data = [data for data in list_data if len(data) == 4]
        return list_data
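    # For example, a file containing "一心一意 画蛇添足" yields
    # ["一心一意", "画蛇添足"]; entries that are not exactly four
    # characters long are dropped.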
    # Build character<->id vocabularies, ordered by descending frequency.
    def collection_counter(self, data_list):
        # Flatten all idioms into a single list of characters.
        sum_list = []
        for data in data_list:
            for da in data:
                sum_list.append(da)
        counter = collections.Counter(sum_list)
        # Sort characters by how often they are used, most frequent first.
        counter_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
        words, _ = list(zip(*counter_pairs))
        # zip the lists into dicts: look up an id by character...
        word_to_id = dict(zip(words, range(len(words))))
        # ...and a character by id.
        id_to_word = dict(zip(range(len(words)), words))
        return word_to_id, id_to_word
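    # For example, if the corpus were just ["一心一意"], the counts would be
    # {"一": 2, "心": 1, "意": 1}, so word_to_id would be
    # {"一": 0, "心": 1, "意": 2} (most frequent first, ties broken by
    # character order) and id_to_word the inverse mapping.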
    # Convert every idiom to ids (training) or convert ids back to characters.
    def word_id_batch(self, word_to_id, id_to_word, list_data, is_training):
        all_id_list = []
        for data in list_data:
            one_id_list = []
            for word in data:
                if is_training:
                    one_id_list.append(word_to_id[word])
                else:
                    one_id_list.append(id_to_word[word])
            all_id_list.append(one_id_list)
        return all_id_list
    def run(self, is_training):
        list_data = self.extract_data()
        # Build the vocabulary first so every character can be encoded as an id.
        word_to_id, id_to_word = self.collection_counter(list_data)
        # Encode every idiom as a list of character ids.
        all_id_list = self.word_id_batch(word_to_id, id_to_word, list_data, is_training)
        # Inputs are characters 1-3; targets are characters 2-4.
        x_all = [data[:3] for data in all_id_list]
        y_all = [data[1:] for data in all_id_list]
        return x_all, y_all
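# A minimal usage sketch for Input (assuming ChengYu.txt holds whitespace-
# separated idioms, one token each):
#
#     loader = Input("./ChengYu.txt")
#     x_all, y_all = loader.run(True)
#     # x_all[i] holds the ids of the first three characters of idiom i;
#     # y_all[i] holds the ids of the last three, i.e. the next-character
#     # target for each step.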
# Model definition, used for training.
class Model(object):
    def __init__(self, file_path, is_training, keepdrop):
        # Number of time steps per idiom (three next-character predictions).
        self.num_steps = 3
        # Number of stacked LSTM layers.
        self.layers = 3
        # Mini-batch size.
        self.batch_size = 50
        self.init_scale = 0.1
        # Vocabulary size (number of distinct characters).
        self.word_size = 4908
        # Size of the character embeddings (and of each LSTM layer).
        self.v_word_size = 650
        # Learning rate.
        self.learning = 0.01
        # Load the idioms and encode every character as an id.
        data_input = Input(file_path)
        x_all, y_all = data_input.run(True)
        # Number of full mini-batches per epoch.
        self.batches_per_epoch = len(x_all) // self.batch_size
        # Both placeholders take integer character ids; the targets are one-hot
        # encoded in-graph below (pre-building tf.one_hot tensors in a Python
        # loop cannot be fed through feed_dict and bloats the graph).
        x = tf.placeholder(tf.int32, [None, 3])
        y_true = tf.placeholder(tf.int32, [None, 3])
with tf.name_scope("data"):
# 将y_true转换为tensor类型
y_true = tf.convert_to_tensor(y_true)
# 转换为词向量 缩小词的大小 inputs 转换为词向量后的结果
x_embedding = tf.Variable(tf.random_uniform([self.word_size, self.v_word_size], self.init_scale, -self.init_scale))
# Tensor("rnn/transpose_1:0", shape=(47247, 3, 650), dtype=float32)
inputs = tf.nn.embedding_lookup(x_embedding, x)
# print(inputs)
# 如果在训练则进行dropout层
if is_training is True:
inputs = tf.nn.dropout(inputs, keepdrop)
with tf.name_scope("rnn"):
# 进行rnn循环神经网络的测试 每一层的神经元个数为650
cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.v_word_size)
# 如果是训练,则进行dropout
tf.nn.rnn_cell.DropoutWrapper(cell, keepdrop)
# 构建三层
muilt_cells = tf.nn.rnn_cell.MultiRNNCell([cell for i in range(3)])
# 将每一层进行初始化
initial_state = muilt_cells.zero_state(batch_size=47247, dtype=tf.float32)
# 进行动态操作
outputs, status = tf.nn.dynamic_rnn(muilt_cells, inputs, initial_state=initial_state, dtype=tf.float32)
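        # outputs has shape [batch_size, num_steps, v_word_size]; status holds
        # the final (c, h) LSTM state of each of the three layers.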
        with tf.name_scope("correct"):
            # Flatten the step outputs so one softmax layer scores every step.
            h_outputs = tf.reshape(outputs, [-1, self.v_word_size])
            softmax_w = tf.Variable(tf.truncated_normal([self.v_word_size, self.word_size], stddev=0.1))
            softmax_b = tf.Variable(tf.truncated_normal([self.word_size], stddev=0.1))
            logits = tf.nn.xw_plus_b(h_outputs, softmax_w, softmax_b)
            # Reshape back to [batch_size, num_steps, word_size].
            logits = tf.reshape(logits, [-1, self.num_steps, self.word_size])
        # Cross-entropy between the softmax over logits and the targets.
        with tf.name_scope("loss"):
            # One-hot encode the integer targets in-graph.
            y_onehot = tf.one_hot(y_true, depth=self.word_size)
            x_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_onehot, logits=logits)
            loss = tf.reduce_mean(x_entropy)
        with tf.name_scope("train_step"):
            train_step = tf.train.AdamOptimizer(self.learning).minimize(loss)
        with tf.name_scope("acc"):
            # Predicted id at each step is the argmax over the vocabulary axis.
            equal = tf.equal(tf.argmax(logits, axis=2), tf.cast(y_true, tf.int64))
            acc = tf.reduce_mean(tf.cast(equal, tf.float32))
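        # This accuracy is an exact next-character match at each of the three
        # steps, averaged over the batch; with 4908 classes, random guessing
        # would score roughly 1/4908 ≈ 0.0002.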
        init_op = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init_op)
            for i in range(50):
                for j in range(self.batches_per_epoch):
                    start = j * self.batch_size
                    end = (j + 1) * self.batch_size
                    sess.run(train_step, feed_dict={x: x_all[start:end],
                                                    y_true: y_all[start:end]})
                # Report accuracy on the last mini-batch of the epoch.
                acc_value = sess.run(acc, feed_dict={x: x_all[start:end],
                                                     y_true: y_all[start:end]})
                print("Epoch {}, acc: {}".format(i + 1, acc_value))
if __name__ == "__main__":
    file_path = "./ChengYu.txt"
    mod = Model(file_path, True, 0.5)
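
# A possible generation step, not implemented above: expose x, logits and the
# session from Model, feed a one-character seed as the first id of x, sample
# the next id from tf.nn.softmax(logits) at step 0, append it and repeat for
# the remaining steps, then decode the four ids with id_to_word (the
# is_training=False path of word_id_batch) to obtain a new idiom.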