A robot that generates Chinese idioms (chengyu) by itself

# -*- coding: utf-8 -*-
"""
@Time    : 19-10-17 4:06 PM
@Author  : lei
@Site    :
@File    : Cheng.py
@Software: PyCharm
"""

# An AI that writes four-character idioms by itself

import tensorflow as tf
import collections
import numpy as np

# Number of idioms: 47247
# Number of distinct characters: 4908

Define the data-extraction class

class Input(object):
    def __init__(self, file_path):
        self.file_path = file_path

    # Extract the data
    def extract_data(self):
        with tf.io.gfile.GFile(self.file_path, "r") as f:
            data = f.read()
        # Split on whitespace so each idiom is one entry
        list_data = data.strip().split()
        # Keep only idioms of at most 4 characters (the analysis targets 4-character idioms)
        list_data = [data for data in list_data if len(data) <= 4]
        return list_data
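    # Example (hypothetical file contents): if ChengYu.txt holds
    # "一心一意 三心二意 一见钟情", extract_data() returns
    # ["一心一意", "三心二意", "一见钟情"].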

    # Build the vocabulary: map each character to an id and back
    def collection_counter(self, data_list):
        # Flatten all idioms into one list of characters
        sum_list = []
        for data in data_list:
            for da in data:
                sum_list.append(da)
        counter = collections.Counter(sum_list)
        # Sort characters by frequency, most frequent first
        counter_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
        words, _ = list(zip(*counter_pairs))
        # zip the two lists into a dict: look up an id by its character
        word_to_id = dict(zip(words, range(len(words))))
        # Look up a character by its id
        id_to_word = dict(zip(range(len(words)), words))

        return word_to_id, id_to_word
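    # Example (hypothetical): for data_list = ["一心一意", "三心二意"] the
    # counts are 一:2, 心:2, 意:2, 三:1, 二:1, so after sorting
    # word_to_id == {"一": 0, "心": 1, "意": 2, "三": 3, "二": 4}
    # and id_to_word is the inverse mapping.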

    # Convert every character in every idiom to its id
    def word_id_batch(self, word_to_id, list_data):
        all_id_list = []
        for data in list_data:
            one_id_list = [word_to_id[word] for word in data]
            all_id_list.append(one_id_list)
        return all_id_list
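    # Example (hypothetical): with word_to_id = {"一": 0, "心": 1, "意": 2},
    # word_id_batch(word_to_id, ["一心一意"]) returns [[0, 1, 0, 2]].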

    def run(self):
        list_data = self.extract_data()
        # Count the characters first so each one can be encoded
        word_to_id, id_to_word = self.collection_counter(list_data)
        # Encode every idiom as a list of ids
        all_id_list = self.word_id_batch(word_to_id, list_data)
        # Language-model shift: the input is characters 1-3,
        # the target is characters 2-4 (each position predicts the next character)
        x_all = [data[:3] for data in all_id_list]
        y_all = [data[1:] for data in all_id_list]
        return x_all, y_all
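
Before wiring up the model it helps to sanity-check what Input produces. A minimal sketch, assuming a small ./ChengYu.txt of whitespace-separated idioms exists (the variable names and sample ids here are illustrative):

inp = Input("./ChengYu.txt")
x_all, y_all = inp.run()
# If the first idiom encodes to [5, 12, 5, 30], then:
#   x_all[0] == [5, 12, 5]    (characters 1-3)
#   y_all[0] == [12, 5, 30]   (characters 2-4, the next-character targets)
print(x_all[0], y_all[0])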

Define the model, used for training

class Model(object):
    def __init__(self, file_path, is_training, keepdrop):
        # Number of time steps the RNN is unrolled for (3 input characters)
        self.num_steps = 3
        # Number of stacked LSTM layers
        self.layers = 3
        # Number of examples per batch
        self.batch_size = 50
        # Number of batches per epoch
        self.num_batches = 47247 // self.batch_size
        self.init_scale = 0.1
        # Number of distinct characters
        self.word_size = 4908
        # Size of the character embedding
        self.v_word_size = 650
        # Learning rate
        self.learning = 0.01

        # Build the training data
        data_input = Input(file_path)
        x_all, y_all = data_input.run()

        # x holds 3 character ids per example; y_true holds the 3 target ids
        x = tf.placeholder(tf.int32, [None, self.num_steps])
        y_true = tf.placeholder(tf.int32, [None, self.num_steps])

        with tf.name_scope("data"):
            # Embed the character ids; "inputs" is the embedded result,
            # shape (batch_size, 3, 650)
            x_embedding = tf.Variable(tf.random_uniform(
                [self.word_size, self.v_word_size], -self.init_scale, self.init_scale))
            inputs = tf.nn.embedding_lookup(x_embedding, x)

            # Apply dropout when training
            if is_training is True:
                inputs = tf.nn.dropout(inputs, keepdrop)

    with tf.name_scope("rnn"):
        # 进行rnn循环神经网络的测试  每一层的神经元个数为650
        cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.v_word_size)
        # 如果是训练,则进行dropout
        tf.nn.rnn_cell.DropoutWrapper(cell, keepdrop)
        # 构建三层
        muilt_cells = tf.nn.rnn_cell.MultiRNNCell([cell for i in range(3)])
        # 将每一层进行初始化
        initial_state = muilt_cells.zero_state(batch_size=47247, dtype=tf.float32)
        # 进行动态操作
        outputs, status = tf.nn.dynamic_rnn(muilt_cells, inputs, initial_state=initial_state, dtype=tf.float32)
        # print(outputs)
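            # Shape check (hypothetical trace, batch_size = 50):
            #   inputs  -> (50, 3, 650)  after the embedding lookup
            #   outputs -> (50, 3, 650)  one 650-vector per time step
            #   status  -> tuple of 3 LSTMStateTuples, one per layer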

    with tf.name_scope("correct"):
        # 进行隐藏层的改变
        h_outputs = tf.reshape(outputs, [-1, 650])
        softmax_x = tf.Variable(tf.truncated_normal([650, 4908], stddev=0.1))
        softmax_y = tf.Variable(tf.truncated_normal([4908], stddev=0.1))

        logits = tf.nn.xw_plus_b(h_outputs, softmax_x, softmax_y)
        # 再将logits转换为三维
        logits = tf.reshape(logits, [47247, 3, -1])
        # print(logits)
        # print(y_true)

        # Cross-entropy against the integer targets
        with tf.name_scope("loss"):
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=logits)
            loss = tf.reduce_mean(cross_entropy)

        with tf.name_scope("train_step"):
            train_step = tf.train.AdamOptimizer(self.learning).minimize(loss)

        with tf.name_scope("acc"):
            # A position is correct when the most likely character equals the target
            equal = tf.equal(tf.argmax(logits, axis=2, output_type=tf.int32), y_true)
            acc = tf.reduce_mean(tf.cast(equal, tf.float32))

        init_op = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init_op)

            for i in range(50):
                for j in range(self.num_batches):
                    start = j * self.batch_size
                    end = (j + 1) * self.batch_size
                    _, acc_val = sess.run([train_step, acc],
                                          feed_dict={x: x_all[start:end], y_true: y_all[start:end]})
                print("epoch {}, acc: {}".format(i + 1, acc_val))

if __name__ == "__main__":
    file_path = "./ChengYu.txt"
    mod = Model(file_path, True, 0.5)
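
The script above only trains the network; it never actually emits an idiom. Below is a minimal decoding sketch under stated assumptions: it presumes the trained sess, x, and logits are still reachable (in the original code they are local to Model.__init__) and that word_to_id / id_to_word from Input.collection_counter are in scope; the seed character and the greedy argmax strategy are illustrative choices, not part of the original code.

# Hypothetical greedy decoder (not part of the original script)
def generate_idiom(sess, x, logits, word_to_id, id_to_word, seed="一"):
    ids = [word_to_id[seed]]
    while len(ids) < 4:
        # Pad the 3-step input on the right with the last known id
        padded = (ids + [ids[-1]] * 3)[:3]
        batch = [padded] * 50  # the graph expects a fixed batch of 50
        out = sess.run(logits, feed_dict={x: batch})
        # The prediction for the character after position k sits at index k
        next_id = int(out[0, len(ids) - 1].argmax())
        ids.append(next_id)
    return "".join(id_to_word[i] for i in ids)

# e.g. print(generate_idiom(sess, x, logits, word_to_id, id_to_word, seed="一"))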
