Background knowledge is left for you to look up; straight to the code. I wrote this back in university, so it may be a bit messy (no time to clean it up since I started working), but the modules themselves definitely work. If you hit errors, they are most likely library incompatibilities; I have forgotten the original library versions, but the Python version was 3.8.
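Judging from the imports, you need at least numpy and tensorflow 2.x for part 1, plus torch and transformers for part 2. Treat the line below as a rough sketch of the environment, not the exact versions the code was tested with:

pip install numpy tensorflow torch transformers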
all_param.py
word2vec_size = 768 # word-vector (embedding) dimension
max_len = 250 # maximum sentence length
batch_size = 16 # batch size
head_num = 8 # number of attention heads; should divide word2vec_size evenly (head_dim = word2vec_size // head_num)
transformer_layer = 1 # number of encoder (decoder) layers
class_num = 2 # number of classes
learning_rate = 1e-5 # learning rate (the code below also reuses this value as the dropout rate)
steps = 10 # number of training epochs
Train = True # True for training mode, False for prediction mode
cnn_layer = 3 # number of CNN layers
kernel_num = 32 # number of convolution kernels
1. Transformer-textCNN
data2vector.py (this part is rather messy; preprocessing differs per dataset, so it is best to write your own)
import numpy as np
from all_param import *
def word2vec_index(file_path):
"""
:param file_path: 词向量文件路径
:return word2vector: 字到向量的字典
:return word2index: 字到词袋表示的字典
:return index2word: 词袋表示到字的字典
"""
word2vector = {}
word2index = {}
index2word = {}
with open(file_path, 'r', encoding='utf-8') as file:
index = 1
data = file.readlines()[1:]
for line in data:
line = line.replace('\n', '')
line = line.split(' ')
word = line[0]
vector = np.array(line[1:], dtype=float)
            # build the indices
word2vector[word] = vector
word2index[word] = index
index2word[index] = word
index +=1
    # add the padding token
word2vector['<pad>'] = np.zeros(shape=(word2vec_size))
word2index['<pad>'] = 0
index2word[0] = '<pad>'
return word2vector, word2index, index2word
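# Expected layout of the .vector file (standard word2vec text format; the
# "vocab_size dim" header line is skipped by readlines()[1:]). The rows below
# are made up for illustration:
#   19529 768
#   电影 0.1032 -0.4271 ... (word2vec_size floats)
#   不错 0.2143 0.0571 ...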
def data_processing(path, data_len, word2vector, word2index, data_batch, data_start_site):
"""
:param path: 数据集路径
:param data_len: 数据数
:param word2vector: 转词向量字典
:param word2index: 转词词袋表示字典
:param data_batch: 一次取的数据数
:param data_start_site: 开始取的数据位置
:return comment2vector: 评论向量表示
:return comment2index: 评论词袋表示
:return labels: 标签(独热编码)
"""
    with open(path, 'r', encoding='utf-8') as file1:
        data = file1.readlines()
        if data_start_site + data_batch > data_len:  # the requested slice runs past the end of the list
            end_site = data_start_site + data_batch - data_len  # wrap around to the front
            data = data[data_start_site:] + data[:end_site]
        else:
            end_site = data_start_site + data_batch  # end position of the slice
            data = data[data_start_site:end_site]
    # initialise the vector and bag-of-words arrays
comment2vector = np.zeros(shape=(len(data), max_len, word2vec_size))
comment2index = np.zeros(shape=(len(data), max_len))
labels = np.zeros(shape=(len(data), class_num), dtype=float)
    # iterate over every comment
    for i in range(len(data)):
        comment = data[i][2:]  # the comment text (everything after "label ")
        comment = comment.replace('\n', '')
        comment = comment.split(' ')
        comment = [w for w in comment if w != '']  # drop empty tokens
        for word in range(max_len):  # numericalise the comment
            if word > len(comment) - 1:  # comment is shorter than max_len, keep the zero padding
                continue
            else:  # normal conversion (assumes every token is in the vector file; unknown tokens raise a KeyError)
                comment2vector[i][word] = word2vector[comment[word]]  # vector representation
                comment2index[i][word] = word2index[comment[word]]  # bag-of-words index
        label = int(data[i][:1])  # the label
        # one-hot encoding
        labels[i][label] = 1
        # label smoothing
        for zero in range(len(labels[i])):
            if labels[i][zero] == 0:
                labels[i][zero] = 0.0000001
            else:
                labels[i][zero] = 0.9999999
    return comment2vector, comment2index, labels
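# Each dataset line is "<label> <space-separated tokens>", e.g. (made-up rows):
#   1 这 部 电影 很 不错
#   0 剧情 太 拖沓 了
# data[i][:1] reads the label and data[i][2:] the tokenised comment.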
if __name__ == '__main__':
    word2vector, word2index, index2word = word2vec_index(
        'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector')  # load the word vectors
    # count the samples in the dataset
    with open('data_set/douban_comment/balanced/balanced_train.txt', 'r', encoding='utf-8') as file1:
        datas_len = len(file1.readlines())
    print('There are {} samples in total'.format(datas_len))
    # feed the dataset in batches
    #batch_num = datas_len // batch_size  # number of full batches
    batch_num = 1
    for i in range(batch_num + 1):
        comment_vector, comment_index, labels = data_processing(
            'data_set/douban_comment/balanced/balanced_train.txt', datas_len, word2vector, word2index, batch_size, i * batch_size)
        print(labels)
block_transformer.py
"""
定义transformer模块
"""
from all_param import *
from tensorflow import keras
import numpy as np
import tensorflow as tf
class transformer(keras.Model):
    def __init__(self, max_len, word_dim, head_num, class_num, learning_rate, Train):
        super(transformer, self).__init__()
        self.Train = Train
        self.pe = self.positional_encoding(word_dim, max_len)  # positional encoding
        self.head_dim = word_dim // head_num  # per-head dimension
        # Q, K, V projection matrices
        self.Wq = keras.layers.Dense(self.head_dim * head_num, kernel_initializer='RandomUniform')
        self.Wk = keras.layers.Dense(self.head_dim * head_num, kernel_initializer='RandomUniform')
        self.Wv = keras.layers.Dense(self.head_dim * head_num, kernel_initializer='RandomUniform')
        # feed-forward network
        self.feed_forward_network = keras.layers.Dense(word_dim * head_num, kernel_initializer='RandomUniform',
                                                       activation=keras.activations.relu)
        self.adjust_shape = [keras.layers.Dense(word_dim, kernel_initializer='RandomUniform') for _ in range(2)]  # project the multi-head / FFN outputs back to word_dim
        self.drop = [keras.layers.Dropout(rate=learning_rate) for _ in range(2)]  # dropout against overfitting (note: the rate is reused from learning_rate)
        self.layer_norm = [keras.layers.LayerNormalization(axis=-1) for _ in range(2)]  # Norm
        self.linear = keras.layers.Dense(class_num, kernel_initializer='RandomUniform')  # final fully connected (linear) layer
    # positional encoding
    def positional_encoding(self, word_dim, max_len):
        """
        :return pe: the positional-encoding matrix [max_len, word_dim]
        """
        # initialise pos and i
        pos = np.array([[i for i in range(max_len)]]).T
        I = np.array([[i if i % 2 == 0 else (i - 1) for i in range(word_dim)]])
        # apply the sinusoidal formula
        pe = pos / np.power(10000, I / word_dim)
        pe[:, 0::2] = np.sin(pe[:, 0::2])
        pe[:, 1::2] = np.cos(pe[:, 1::2])
        return pe
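    # This is the standard sinusoidal encoding from "Attention Is All You Need":
    #   PE(pos, 2i)   = sin(pos / 10000^(2i / word_dim))
    #   PE(pos, 2i+1) = cos(pos / 10000^(2i / word_dim))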
    # multi-head attention
    def multi_head_attention(self, x_embedding, x_index, this_layer):
        """
        :param x_embedding: word-vector representation
        :return output: word vectors enriched with attention information
        """
        # project into Q, K, V
        q, k, v = self.Wq(x_embedding), self.Wk(x_embedding), self.Wv(x_embedding)
        # split heads: reshape to [batch, seq, head_num, head_dim] first, then transpose to
        # [batch, head_num, seq, head_dim] (a direct reshape would mix positions across heads)
        h_q = tf.transpose(tf.reshape(q, (q.shape[0], q.shape[1], head_num, self.head_dim)), perm=[0, 2, 1, 3])
        h_k = tf.transpose(tf.reshape(k, (k.shape[0], k.shape[1], head_num, self.head_dim)), perm=[0, 2, 1, 3])
        h_v = tf.transpose(tf.reshape(v, (v.shape[0], v.shape[1], head_num, self.head_dim)), perm=[0, 2, 1, 3])
        dk = h_q.shape[-1]
        attention = tf.matmul(h_q, h_k, transpose_b=True) / np.sqrt(dk)  # unmasked attention scores
        attention_mask = self.mask(x_index)
        # apply the mask
        attention += attention_mask * -1e10  # drive masked positions towards negative infinity
        self.attention = tf.nn.softmax(attention, axis=-1)  # after softmax the masked attention is effectively 0
        att_message = tf.matmul(self.attention, h_v)  # attention-weighted word vectors
        # reshape the output back
        att_message = tf.transpose(att_message, perm=[0, 2, 1, 3])  # merge head_num and head_dim back into one axis
        att_message = tf.reshape(att_message, (att_message.shape[0], att_message.shape[1], -1))
        output = self.adjust_shape[0](att_message)  # project back to word_dim (head_num * head_dim need not equal word_dim)
        output = self.drop[0](output, training=self.Train)
        return output
    # padding mask for multi-head attention
    def mask(self, x_index):
        """
        :param x_index: bag-of-words representation
        :return attention_mask: attention mask (1 where a position must be hidden)
        """
        mask = tf.math.equal(x_index, np.zeros(shape=x_index.shape))  # True where the position holds the <pad> token
        attention_mask = mask[:, np.newaxis, np.newaxis, :]
        attention_mask = tf.cast(attention_mask, dtype=tf.float32)  # padding mask as float
        return attention_mask
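    # Example: x_index = [[5, 3, 0, 0]] gives attention_mask = [[[[0., 0., 1., 1.]]]],
    # which broadcasts over heads and query positions when added to the scores.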
    # feed-forward network
    def feed_forward(self, attention, this_layer):
        """
        :param attention: attention-enriched word vectors
        :return output: transformed word vectors
        """
        output = self.feed_forward_network(attention)
        output = self.adjust_shape[1](output)
        output = self.drop[1](output, training=self.Train)
        return output
    # a single encoder layer
    def encoder_layer(self, x_embedding, x_index, this_layer):
        """
        :param x_embedding: word vectors with positional encoding added
        :param x_index: bag-of-words representation
        :param this_layer: index of this encoder layer
        :return: the layer output
        """
        x_attention = self.layer_norm[0](x_embedding)  # Norm (pre-LayerNorm)
        x_attention = self.multi_head_attention(x_attention, x_index, this_layer)  # multi-head attention
        x_attention += x_embedding  # Add (residual)
        x_message = self.layer_norm[1](x_attention)  # Norm (pre-LayerNorm)
        x_message = self.feed_forward(x_message, this_layer)  # feed-forward network
        x_message += x_attention  # Add (residual)
        return x_message
    # the full encoder stack
    def encoder(self, x_embedding, x_index, layer_num):
        """
        :param x_embedding: word vectors with positional encoding added
        :param x_index: bag-of-words representation
        :return x_message: the information extracted by the encoder
        """
        x_message = x_embedding
        for i in range(layer_num):  # number of encoder layers
            x_message = self.encoder_layer(x_message, x_index, i)
        return x_message
    # the whole transformer model
    def calls(self, x_vector, x_index, layer_num):
        """
        :param x_vector: word-vector representation
        :param x_index: bag-of-words representation
        :return: per-token encodings (the classification head below is commented out)
        """
        x_embedding = tf.cast(x_vector + self.pe, tf.float32)  # add the positional encoding (cast so the residual additions match the float32 layer outputs)
        scores = self.encoder(x_embedding, x_index, layer_num)  # encoder
        #scores = tf.reduce_mean(scores, axis=1)  # average over max_len to get a sentence vector
        #scores = self.linear(scores)  # fully connected layer -> [batch_size, class_num]
        #scores = tf.math.softmax(scores, axis=-1)  # class probabilities
        return scores
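    # Shape flow: x_vector [batch, max_len, word_dim] + pe [max_len, word_dim]
    # -> encoder output [batch, max_len, word_dim]. Re-enable the three commented
    # lines above to use the encoder as a standalone classifier; the training
    # code under __main__ below assumes they are enabled.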
if __name__ == '__main__':
"""
测试transformer能不能正常使用
"""
import os
import data2vector
import pickle
import time
    # training
    def train(model, data_path, batch_size, steps, word2vector, word2index, class_num, layer_num,
              cross_entropy, optimizer, save_path, writing_mode):
        """
        :param data_path: path to the training set
        :param batch_size: batch size
        :param steps: number of training epochs
        """
        # count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('There are {} samples in total'.format(datas_len))
        # training epochs
        all_time = 0  # total training time
        for step in range(steps):
            start_time = time.time()
            # feed the dataset in batches
            data_copies = datas_len // batch_size  # number of full batches
            #data_copies = 10
            # accumulators for the results of every batch,
            # sized to exactly the samples we fill (extra zero rows would skew the metrics)
            all_loss = []
            all_scores = np.zeros(shape=(data_copies * batch_size, class_num))
            all_labels = np.zeros(shape=(data_copies * batch_size, class_num))
            for i in range(data_copies):
                x_vector, x_index, labels = data2vector.data_processing(
                    data_path, datas_len, word2vector, word2index,
                    batch_size, i * batch_size)
                # forward pass and loss
                with tf.GradientTape() as tape:
                    scores = model.calls(x_vector, x_index, layer_num)  # model predictions
                    loss = cross_entropy(labels, scores)  # cross-entropy loss
                derivative = tape.gradient(loss, model.trainable_variables)  # automatic differentiation
                optimizer.apply_gradients(zip(derivative, model.trainable_variables))  # update the parameters
                # accumulate the epoch results
                all_loss.append(loss)
                all_scores[i * batch_size: (i + 1) * batch_size, :] = scores
                all_labels[i * batch_size: (i + 1) * batch_size, :] = labels
                print('\r{} batches in total, batch {:3}, current loss: {:4f} '.format(data_copies, i, float(loss)), end='')
            # print and save this epoch's results
            if step % 1 == 0:
                this_time = time.time() - start_time  # time for this epoch
                all_time += this_time  # total time
                predict_value = np.argmax(all_scores, axis=-1)[:, None]  # predicted labels (0 or 1)
                actual_value = np.argmax(all_labels, axis=-1)[:, None]  # true labels
                result = np.concatenate((predict_value, actual_value), axis=1)  # pairs of [prediction, truth]
                # save and print
                look_and_save_data(model, result, this_time, save_path, writing_mode, word2vector=word2vector,
                                   word2index=word2index, index2word=index2word, step=step,
                                   loss=np.array(all_loss).mean(), all_time=all_time)
                writing_mode = 'a'
    # testing
    def test(model, data_path, batch_size, layer_num, class_num, save_path, writing_mode):
        """
        :param data_path: path to the test set
        :param batch_size: batch size
        """
        # load the trained model
        with open(save_path + "/tmp/transformer_word2idx_idx2word.pkl", "rb") as f:
            dic = pickle.load(f)
        word2vector = dic['word2vector']
        word2index = dic['word2idx']
        model.load_weights(save_path + "/model.ckpt")
        # count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('There are {} samples in total'.format(datas_len))
        # testing
        start_time = time.time()
        batch_num = datas_len // batch_size  # number of batches to process
        # accumulator for all predictions
        results = np.zeros(shape=(batch_num * batch_size, class_num))
        for i in range(batch_num):
            x_vector, x_index, labels = data2vector.data_processing(
                data_path, datas_len, word2vector, word2index,
                batch_size, i * batch_size)
            scores = model.calls(x_vector, x_index, layer_num)  # model predictions
            predict_value = np.argmax(scores, axis=-1)[:, None]  # predicted labels (0 or 1)
            actual_value = np.argmax(labels, axis=-1)[:, None]  # true labels
            result = np.concatenate((predict_value, actual_value), axis=1)  # pairs of [prediction, truth]
            results[i * batch_size: (i + 1) * batch_size, :] = result  # store this batch's results
            print('\rbatch {:3} of {} batches'.format(i + 1, batch_num), end='')
        times = time.time() - start_time
        look_and_save_data(model, results, times, save_path, writing_mode)
    # print and save the training progress or test results
    def look_and_save_data(model, result, this_time, save_path, writing_mode, word2vector=None, word2index=None, index2word=None,
                           step=None, loss=None, all_time=None):
        """
        :param result: predictions and labels, pairs of [prediction, truth]
        :param this_time: time taken this round
        :param step: training epoch
        :param loss: loss value
        :param all_time: total time
        """
        # compute P, R, F1 and Accuracy (labels are 0/1, so pred + truth == 2 -> TP, == 0 -> TN)
        TP = len([i for i in result if i.sum() == 2])
        TN = len([i for i in result if i.sum() == 0])
        FP = len([i for i in result if (i[0] - i[1]) == 1])
        FN = len([i for i in result if (i[0] - i[1]) == -1])
        P = (TP + 0.0001) / (TP + FP + 0.0001)
        R = (TP + 0.0001) / (TP + FN + 0.0001)
        F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
        Accuracy = (TP + TN) / len(result)
        os.makedirs(save_path, exist_ok=True)  # make sure the output directory exists
        # print and save the results
        if Train == True:  # training mode
            # print and save the training progress
            print("\tstep: {:3} | mean_loss: {:3f} | time: {:3f}m | Accuracy: {:3f} |".format(
                step, loss, this_time / 60, Accuracy))
            # save the training progress
            with open(save_path + '/train_process.txt', writing_mode, encoding='utf-8') as file:
                file.write(
                    "step: {:3} | mean_loss: {:3f} | time: {:3f} | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
                        step, loss, all_time, P, R, F1, Accuracy))
            # save the model
            model.save_weights(save_path + "/model.ckpt")
            os.makedirs(save_path + "/tmp", exist_ok=True)
            with open(save_path + "/tmp/transformer_word2idx_idx2word.pkl", "wb") as f:
                pickle.dump({"word2vector": word2vector, "word2idx": word2index, "idx2word": index2word}, f)
        else:  # test mode
            print("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
                P, R, F1, Accuracy, this_time))
            with open(save_path + '/test_result.txt', writing_mode, encoding='utf-8') as file:
                file.write("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
                    P, R, F1, Accuracy, this_time))
    # initialise the loss and the optimizer
    cross_entropy = keras.losses.CategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam()
    writing_mode = 'w'  # start in overwrite mode
    save_path = './model_data/cg'
    # initialise the model
    model = transformer(max_len, word2vec_size, head_num, class_num, learning_rate, Train)
    if Train == True:  # train the model
        word2vector, word2index, index2word = data2vector.word2vec_index(
            'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector')
        train(model, 'data_set/douban_comment/balanced/balanced_train.txt', batch_size, steps,
              word2vector, word2index, class_num, transformer_layer, cross_entropy, optimizer, save_path, writing_mode)
    else:  # test the model
        test(model, 'data_set/douban_comment/balanced/balanced_test.txt', batch_size, transformer_layer,
             class_num, save_path, writing_mode)
block_CNN.py
import tensorflow as tf
from all_param import *
import numpy as np
class TextCNN(tf.keras.Model):
def __init__(self, word2vec_size, kernel_num, cnn_layer, learning_rate, class_num, Train):
super(TextCNN, self).__init__()
self.Train = Train
        # first convolution layer: kernels of size (2, embed_dim), (3, embed_dim) and (4, embed_dim)
        self.conv = [tf.keras.layers.Conv2D(kernel_num, (i, word2vec_size), strides=(1, 1), padding='valid',
                                            kernel_initializer='RandomUniform', activation='relu') for i in range(2, 5)]
        self.max_pool = tf.keras.layers.MaxPool1D(pool_size=2, padding='same')
        self.drop = tf.keras.layers.Dropout(rate=learning_rate)
        # self.line = tf.keras.layers.Dense(512, kernel_initializer='RandomUniform')
        self.line0 = tf.keras.layers.Dense(word2vec_size, kernel_initializer='RandomUniform')  # fully connected layer
        self.line1 = tf.keras.layers.Dense(class_num, kernel_initializer='RandomUniform')  # fully connected classification layer
        # the deeper convolution layers (channels double at every layer)
        if cnn_layer > 1:
            self.conv_add = [tf.keras.layers.Conv1D((2 ** (i + 1)) * kernel_num, 2, strides=1,
                                                    kernel_initializer='RandomUniform', activation='relu', padding='valid') for i in range(cnn_layer - 1)]
    # one CNN branch
    def conv_and_pool(self, input, conv):
        """
        :param input: input data
        :param conv: the convolution layer
        :return: the flattened branch output
        """
        data = conv(input)  # convolution [batch, max_len-1, 1, kernel_num]
        data = tf.reshape(data, (data.shape[0], data.shape[1], -1))  # squeeze [batch, max_len-1, kernel_num]
        data = self.max_pool(data)  # pooling [batch, (max_len-1)/2, kernel_num]
        if cnn_layer > 1:  # the deeper convolution layers
            for this_layer in range(cnn_layer - 1):  # e.g. shapes in the second layer:
                data = self.conv_add[this_layer](data)  # convolution [batch, (max_len-1)/2-1, kernel_num*2]
                data = self.max_pool(data)  # pooling [batch, ((max_len-1)/2-1)/2, kernel_num*2]
        data = tf.reshape(data, (data.shape[0], -1))  # flatten the trailing axes
        return data
    # run the three branches (kernel heights 2, 3 and 4)
    def calls(self, input):
        """
        :param input: input data [batch, max_len, embed_dim, 1]
        :return: a word2vec_size-dimensional feature vector per sample
        """
        datas = []
        # collect the three branch outputs
        for i in range(len(self.conv)):
            data = self.conv_and_pool(input, self.conv[i])
            datas.append(data)
        # concatenate the branch outputs
        for i in range(1, len(datas)):
            datas[0] = tf.concat((datas[0], datas[i]), 1)
        output = self.drop(datas[0], training=self.Train)  # dropout against overfitting
        output = self.line0(output)
        #output = self.line1(output)  # fully connected classification layer; re-enable together
        #output = tf.math.softmax(output, axis=-1)  # with softmax to run the standalone __main__ below
        return output
if __name__=='__main__':
"""
测试CNN能不能正常使用
"""
import os
import data2vector
import pickle
import time
    # training
    def train(model, data_path, batch_size, steps, word2vector, word2index, index2word, class_num,
              cross_entropy, optimizer, save_path, writing_mode):
        """
        :param data_path: path to the training set
        :param batch_size: batch size
        :param steps: number of training epochs
        """
        # count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('There are {} samples in total'.format(datas_len))
        # training epochs
        all_time = 0  # total training time
        for step in range(steps):
            start_time = time.time()
            # feed the dataset in batches
            data_copies = datas_len // batch_size  # number of full batches
            #data_copies = 80
            # accumulators for the results of every batch
            all_loss = []
            all_scores = np.zeros(shape=(data_copies * batch_size, class_num))
            all_labels = np.zeros(shape=(data_copies * batch_size, class_num))
            for i in range(data_copies):
                x_vector, x_index, labels = data2vector.data_processing(
                    data_path, datas_len, word2vector, word2index,
                    batch_size, i * batch_size)
                # forward pass and loss
                with tf.GradientTape() as tape:
                    x_vector = x_vector[:, :, :, np.newaxis]
                    scores = model.calls(x_vector)  # model predictions
                    loss = cross_entropy(labels, scores)  # cross-entropy loss
                derivative = tape.gradient(loss, model.trainable_variables)  # automatic differentiation
                optimizer.apply_gradients(zip(derivative, model.trainable_variables))  # update the parameters
                # accumulate the epoch results
                all_loss.append(loss)
                all_scores[i * batch_size: (i + 1) * batch_size, :] = scores
                all_labels[i * batch_size: (i + 1) * batch_size, :] = labels
                print('\r{} batches in total, batch {:3}, current loss: {:4f} '.format(data_copies, i + 1, float(loss)), end='')
            # print and save this epoch's results
            if step % 1 == 0:
                this_time = time.time() - start_time  # time for this epoch
                all_time += this_time  # total time
                predict_value = np.argmax(all_scores, axis=-1)[:, None]  # predicted labels (0 or 1)
                actual_value = np.argmax(all_labels, axis=-1)[:, None]  # true labels
                result = np.concatenate((predict_value, actual_value), axis=1)  # pairs of [prediction, truth]
                mean_loss = np.array(all_loss).mean()  # mean loss
                look_and_save_data(model, result, this_time, save_path, writing_mode, word2vector=word2vector,
                                   word2index=word2index, index2word=index2word, step=step,
                                   loss=mean_loss, all_time=all_time)  # save and print
                writing_mode = 'a'
    # testing
    def test(model, data_path, batch_size, class_num, save_path, writing_mode):
        """
        :param data_path: path to the test set
        :param batch_size: batch size
        """
        # load the trained model
        with open(save_path + "/tmp/transformer_word2idx_idx2word.pkl", "rb") as f:
            dic = pickle.load(f)
        word2vector = dic['word2vector']
        word2index = dic['word2idx']
        model.load_weights(save_path + "/model.ckpt")
        # count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('There are {} samples in total'.format(datas_len))
        # testing
        start_time = time.time()
        batch_num = datas_len // batch_size  # number of batches to process
        # accumulator for all predictions
        results = np.zeros(shape=(batch_num * batch_size, class_num))
        for i in range(batch_num):
            x_vector, x_index, labels = data2vector.data_processing(
                data_path, datas_len, word2vector, word2index,
                batch_size, i * batch_size)
            x_vector = x_vector[:, :, :, np.newaxis]
            scores = model.calls(x_vector)  # model predictions
            predict_value = np.argmax(scores, axis=-1)[:, None]  # predicted labels (0 or 1)
            actual_value = np.argmax(labels, axis=-1)[:, None]  # true labels
            result = np.concatenate((predict_value, actual_value), axis=1)  # pairs of [prediction, truth]
            results[i * batch_size: (i + 1) * batch_size, :] = result  # store this batch's results
            print('\rbatch {:3} of {} batches'.format(i + 1, batch_num), end='')
        times = time.time() - start_time
        look_and_save_data(model, results, times, save_path, writing_mode)
    # print and save the training progress or test results
    def look_and_save_data(model, result, this_time, save_path, writing_mode, word2vector=None, word2index=None, index2word=None,
                           step=None, loss=None, all_time=None):
        """
        :param result: predictions and labels, pairs of [prediction, truth]
        :param this_time: time taken this round
        :param step: training epoch
        :param loss: loss value
        :param all_time: total time
        """
        # compute P, R, F1 and Accuracy
        TP = len([i for i in result if i.sum() == 2])
        TN = len([i for i in result if i.sum() == 0])
        FP = len([i for i in result if (i[0] - i[1]) == 1])
        FN = len([i for i in result if (i[0] - i[1]) == -1])
        P = (TP + 0.0001) / (TP + FP + 0.0001)
        R = (TP + 0.0001) / (TP + FN + 0.0001)
        F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
        Accuracy = (TP + TN) / len(result)
        os.makedirs(save_path, exist_ok=True)  # make sure the output directory exists
        # print and save the results
        if Train == True:  # training mode
            # print and save the training progress
            print("\tstep: {:3} | mean_loss: {:3f} | time: {:3f}m | Accuracy: {:3f} |".format(
                step, loss, this_time / 60, Accuracy))
            # save the training progress
            with open(save_path + '/train_process.txt', writing_mode, encoding='utf-8') as file:
                file.write(
                    "step: {:3} | mean_loss: {:3f} | time: {:3f} | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
                        step, loss, all_time, P, R, F1, Accuracy))
            # save the model
            model.save_weights(save_path + "/model.ckpt")
            os.makedirs(save_path + "/tmp", exist_ok=True)
            with open(save_path + "/tmp/transformer_word2idx_idx2word.pkl", "wb") as f:
                pickle.dump({"word2vector": word2vector, "word2idx": word2index, "idx2word": index2word}, f)
        else:  # test mode
            print("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
                P, R, F1, Accuracy, this_time))
            with open(save_path + '/test_result.txt', writing_mode, encoding='utf-8') as file:
                file.write("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
                    P, R, F1, Accuracy, this_time))
    # initialise the loss and the optimizer
    cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    optimizer = tf.keras.optimizers.Adam()
    writing_mode = 'w'  # start in overwrite mode
    save_path = './model_data/balanced_CNN_4_128'
    # initialise the model
    model = TextCNN(word2vec_size, kernel_num, cnn_layer, learning_rate, class_num, Train)
    if Train == True:  # train the model
        word2vector, word2index, index2word = data2vector.word2vec_index(
            'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector')
        train(model, 'data_set/douban_comment/balanced/balanced_train.txt', batch_size, steps,
              word2vector, word2index, index2word, class_num, cross_entropy, optimizer, save_path, writing_mode)
    else:  # test the model
        test(model, 'data_set/douban_comment/balanced/balanced_test.txt', batch_size,
             class_num, save_path, writing_mode)
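A quick standalone shape check for this block (a sketch, assuming the defaults in all_param.py; as published, calls returns the word2vec_size-dimensional feature that transformer-textCNN.py feeds into its own classification layer):

import tensorflow as tf
from all_param import *
from block_CNN import TextCNN

model = TextCNN(word2vec_size, kernel_num, cnn_layer, learning_rate, class_num, Train)
x = tf.random.normal((batch_size, max_len, word2vec_size, 1))  # [batch, max_len, embed_dim, channel]
print(model.calls(x).shape)  # expected: (batch_size, word2vec_size), i.e. (16, 768)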
transformer-textCNN.py
import os
import pickle
from all_param import *
import data2vector
import numpy as np
import tensorflow as tf
import time
import block_CNN,block_transformer
class TransformerCNN(tf.keras.Model):
    def __init__(self, max_len, word2vec_size, head_num, class_num, learning_rate, Train, kernel_num, cnn_layer):
        super(TransformerCNN, self).__init__()
        self.transformer = block_transformer.transformer(max_len, word2vec_size, head_num, class_num, learning_rate, Train)
        self.CNN = block_CNN.TextCNN(word2vec_size, kernel_num, cnn_layer, learning_rate, class_num, Train)
        self.Train = Train
        # initialise the loss and the optimizer
        self.cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
        self.optimizer = tf.keras.optimizers.Adam()
        self.tf_linear = tf.keras.layers.Dense(word2vec_size, kernel_initializer='RandomUniform')   # (unused in this version)
        self.cnn_linear = tf.keras.layers.Dense(word2vec_size, kernel_initializer='RandomUniform')  # (unused in this version)
        self.linear = tf.keras.layers.Dense(class_num, kernel_initializer='RandomUniform')  # final fully connected (linear) layer
        self.writing_mode = 'w'  # start in overwrite mode
        self.attention = None
    # training
    def train(self, data_path, batch_size, steps, word2vector, word2index, class_num, save_path, layer_num):
        """
        :param data_path: path to the training set
        :param batch_size: batch size
        :param steps: number of training epochs
        """
        # count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('There are {} samples in total'.format(datas_len))
        # training epochs
        all_time = 0  # total training time
        for step in range(steps):
            start_time = time.time()
            # feed the dataset in batches
            data_copies = datas_len // batch_size  # number of full batches
            # data_copies = 10
            # accumulators for the results of every batch
            all_loss = []
            all_scores = np.zeros(shape=(data_copies * batch_size, class_num))
            all_labels = np.zeros(shape=(data_copies * batch_size, class_num), dtype=float)
            for i in range(data_copies):
                x_vector, x_index, labels = data2vector.data_processing(
                    data_path, datas_len, word2vector, word2index,
                    batch_size, i * batch_size)
                # forward pass and loss
                with tf.GradientTape() as tape:
                    scores1 = self.transformer.calls(x_vector, x_index, layer_num)  # transformer output
                    scores2 = self.CNN.calls(scores1[:, :, :, np.newaxis])  # CNN output
                    scores = self.linear(scores2)  # fully connected layer
                    scores = tf.math.softmax(scores, axis=-1)  # class probabilities
                    loss = self.cross_entropy(labels, scores)  # cross-entropy loss
                derivative = tape.gradient(loss, self.trainable_variables)  # automatic differentiation
                self.optimizer.apply_gradients(zip(derivative, self.trainable_variables))  # update the parameters
                # accumulate the epoch results
                all_loss.append(loss)
                all_scores[i * batch_size: (i + 1) * batch_size, :] = scores
                all_labels[i * batch_size: (i + 1) * batch_size, :] = labels
                print('\r{} batches in total, batch {:3}, current loss: {:4f} '.format(data_copies, i, float(loss)), end='')
            # print and save this epoch's results
            if step % 1 == 0:
                this_time = time.time() - start_time  # time for this epoch
                all_time += this_time  # total time
                predict_value = np.argmax(all_scores, axis=-1)[:, None]  # predicted labels (0 or 1)
                actual_value = np.argmax(all_labels, axis=-1)[:, None]  # true labels
                result = np.concatenate((predict_value, actual_value), axis=1)  # pairs of [prediction, truth]
                mean_loss = np.array(all_loss).mean()
                self.look_and_save_data(result, this_time, save_path, word2vector=word2vector,
                                        word2index=word2index, index2word=index2word, step=step,
                                        loss=mean_loss, all_time=all_time)  # save and print
                self.writing_mode = 'a'
    # testing
    def test(self, data_path, batch_size, layer_num, class_num, save_path):
        """
        :param data_path: path to the test set
        :param batch_size: batch size
        """
        # load the trained model
        with open(save_path + "/tmp/transformer_word2idx_idx2word.pkl", "rb") as f:
            dic = pickle.load(f)
        word2vector = dic['word2vector']
        word2index = dic['word2idx']
        self.load_weights(save_path + "/model.ckpt")
        # count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('There are {} samples in total'.format(datas_len))
        # testing
        start_time = time.time()
        batch_num = datas_len // batch_size  # number of batches to process
        # accumulator for all predictions
        results = np.zeros(shape=(batch_num * batch_size, class_num))
        for i in range(batch_num):
            x_vector, x_index, labels = data2vector.data_processing(
                data_path, datas_len, word2vector, word2index,
                batch_size, i * batch_size)
            scores1 = self.transformer.calls(x_vector, x_index, layer_num)  # transformer output
            scores2 = self.CNN.calls(scores1[:, :, :, np.newaxis])  # CNN output
            scores = self.linear(scores2)  # fully connected layer
            predict_value = np.argmax(scores, axis=-1)[:, None]  # predicted labels (0 or 1)
            actual_value = np.argmax(labels, axis=-1)[:, None]  # true labels
            result = np.concatenate((predict_value, actual_value), axis=1)  # pairs of [prediction, truth]
            results[i * batch_size: (i + 1) * batch_size, :] = result  # store this batch's results
            print('\rbatch {:3} of {} batches'.format(i + 1, batch_num), end='')
        times = time.time() - start_time
        self.look_and_save_data(results, times, save_path)
    # print and save the training progress or test results
    def look_and_save_data(self, result, this_time, save_path, word2vector=None, word2index=None, index2word=None,
                           step=None, loss=None, all_time=None):
        """
        :param result: predictions and labels, pairs of [prediction, truth]
        :param this_time: time taken this round
        :param step: training epoch
        :param loss: loss value
        :param all_time: total time
        """
        # compute P, R, F1 and Accuracy
        TP = len([i for i in result if i.sum() == 2])
        TN = len([i for i in result if i.sum() == 0])
        FP = len([i for i in result if (i[0] - i[1]) == 1])
        FN = len([i for i in result if (i[0] - i[1]) == -1])
        P = (TP + 0.0001) / (TP + FP + 0.0001)
        R = (TP + 0.0001) / (TP + FN + 0.0001)
        F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
        Accuracy = (TP + TN) / len(result)
        os.makedirs(save_path, exist_ok=True)
        # print and save the results
        if self.Train == True:  # training mode
            # print and save the training progress
            print("\tstep: {:3} | mean_loss: {:3f} | time: {:3f}m | Accuracy: {:3f} |".format(
                step, loss, this_time / 60, Accuracy))
            # save the training progress
            with open(save_path + '/train_process.txt', self.writing_mode, encoding='utf-8') as file:
                file.write(
                    "step: {:3} | mean_loss: {:3f} | time: {:3f} | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
                        step, loss, all_time, P, R, F1, Accuracy))
            # save the model
            self.save_weights(save_path + "/model.ckpt")
            os.makedirs(save_path + "/tmp", exist_ok=True)
            with open(save_path + "/tmp/transformer_word2idx_idx2word.pkl", "wb") as f:
                pickle.dump({"word2vector": word2vector, "word2idx": word2index, "idx2word": index2word}, f)
        else:  # test mode
            print("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
                P, R, F1, Accuracy, this_time))
            with open(save_path + '/test_result.txt', self.writing_mode, encoding='utf-8') as file:
                file.write("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
                    P, R, F1, Accuracy, this_time))
if __name__ == '__main__':
    # initialise the model
    model = TransformerCNN(max_len, word2vec_size, head_num, class_num, learning_rate, Train, kernel_num, cnn_layer)
    save_path = './model_data/balanced_RU_1_64_CNN_4_64_label'
    if Train == True:  # train the model
        word2vector, word2index, index2word = data2vector.word2vec_index(
            'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector')
        model.train('data_set/douban_comment/balanced/balanced_train.txt', batch_size, steps, word2vector, word2index,
                    class_num, save_path, transformer_layer)
    else:  # test the model
        model.test('data_set/douban_comment/balanced/balanced_test.txt', batch_size, transformer_layer, class_num, save_path)
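To recap how the two blocks compose, each batch goes transformer -> CNN -> linear -> softmax. The three lines below are not a standalone script, just the forward pass extracted from train()/test() above, annotated with shapes:

scores1 = self.transformer.calls(x_vector, x_index, transformer_layer)  # [batch, max_len, word2vec_size]
scores2 = self.CNN.calls(scores1[:, :, :, np.newaxis])                  # [batch, word2vec_size]
scores = tf.math.softmax(self.linear(scores2), axis=-1)                 # [batch, class_num]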
2. BERT-textCNN
get_data.py
import numpy as np
# data loading and preprocessing
def get_input(path, data_num, data_batch, data_start_site):
    # read one batch of samples
    with open(path, 'r', encoding='utf-8') as file1:
        data = file1.readlines()
        if data_start_site + data_batch > data_num:  # the requested slice runs past the end of the list
            end_site = data_start_site + data_batch - data_num  # wrap around to the front
            data = data[data_start_site:] + data[:end_site]
        else:
            end_site = data_start_site + data_batch  # end position of the slice
            data = data[data_start_site:end_site]
    labels = np.zeros(shape=(len(data)))
    comments = []
    # preprocessing
    for i in range(len(data)):
        one_data = data[i].replace('\n', '')
        one_data = one_data.split(' ')
        label, comment = int(one_data[0]), one_data[1:]
        if label != 0 and label != 1:  # discard samples with an invalid label
            labels[i] = 0
        else:
            if label == 0:
                labels[i] = 0.001
            if label == 1:
                labels[i] = 1.001
        comments.append(''.join(comment))
    return labels, comments
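# Note: valid labels are stored as 0.001 / 1.001 so that the .long() cast in
# bert_torch.py maps them back to 0 / 1, while malformed lines fall back to 0.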
if __name__ == '__main__':
    with open('data_set/douban_comment/balanced/balanced_train.txt', 'r', encoding='utf-8') as file:
        data_len = len(file.readlines())
    labels, comments = get_input('data_set/douban_comment/balanced/balanced_train.txt', data_len, 10, 0)
print(labels)
bert_torch.py
from transformers import BertModel, BertTokenizer
import torch
#print(torch.cuda.is_available())      # is a GPU available?
#print(torch.cuda.device_count())      # number of GPUs
#print(torch.cuda.current_device())    # index of the current GPU
#print(torch.cuda.get_device_name(0))  # name of the GPU at a given index
class bert(torch.nn.Module):
    def __init__(self, class_num=None):
        # class_num is unused here; it is accepted only so that bert_textCNN.py
        # can construct this class as bert_torch.bert(class_num)
        super(bert, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm')  # BERT tokenizer
        self.BERT = BertModel.from_pretrained('hfl/chinese-bert-wwm')  # BERT model
    def calls(self, input_list, max_length, device=None):
        # max_length and device are passed in explicitly so this also works when
        # imported from bert_textCNN.py, which calls calls(comments, max_len, DEVICE)
        batch_tokenized = self.tokenizer.batch_encode_plus(input_list, add_special_tokens=True,
                                                           max_length=max_length, padding='max_length',
                                                           truncation=True)
        input_ids = torch.tensor(batch_tokenized['input_ids'])
        attention_mask = torch.tensor(batch_tokenized['attention_mask'])
        if device is not None:  # move the inputs to the same device as the model
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        #with torch.no_grad():
        hidden_outputs = self.BERT(input_ids, attention_mask=attention_mask)
        outputs = hidden_outputs[0]  # [0] is last_hidden_state; [:, 0, :] below picks the [CLS] position
        cls = outputs[:, 0, :]
        return outputs, cls
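    # Shapes: outputs is [batch, max_length, 768] (the last_hidden_state of
    # chinese-bert-wwm) and cls is the [CLS] vector, [batch, 768].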
if __name__ == '__main__':
import get_data
import numpy as np
import os
import time
from all_param import *
    def train(BERT, data_path, epoch, batch_size, class_num, optimizer, line, cross_entropy, save_path, writing_mode, Train):
        # count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('There are {} samples in total'.format(datas_len))
        # training
        all_time_start = time.time()
        torch.cuda.empty_cache()
        for e in range(epoch):
            this_time_start = time.time()  # start time
            batch_num = datas_len // batch_size  # number of full batches
            batch_num = 2  # debug override: only run 2 batches per epoch
            all_loss = []
            all_outputs = torch.tensor(np.zeros(shape=(1, class_num)), dtype=torch.float32)
            all_labels = torch.tensor(np.zeros(shape=(1)), dtype=torch.float32)
            # batch training
            for batch in range(batch_num):
                # fetch a batch
                labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch)
                labels = torch.tensor(labels, dtype=torch.float32).long()  # 0.001/1.001 -> 0/1
                optimizer.zero_grad()  # 1. zero the gradients
                _, cls = BERT.calls(comments, max_len)  # 2. forward pass
                cls = line(cls)
                #cls = torch.softmax(cls, dim=-1)
                loss = cross_entropy(cls, labels)  # 3. compute the loss
                loss.backward()  # 4. backpropagation
                optimizer.step()  # 5. update the parameters (w, b)
                print('\r{} batches in total, batch {:3}, current loss: {:4f} '.format(batch_num, batch, loss), end='')
                # accumulate the epoch results
                all_loss.append(loss.item())  # item() extracts the scalar loss
                all_outputs = torch.cat((all_outputs, cls), dim=0)
                all_labels = torch.cat((all_labels, labels.float()), dim=0)
            # print and save this epoch's results
            if e % 1 == 0:
                this_time = time.time() - this_time_start  # time for this epoch
                all_time = time.time() - all_time_start  # total time so far
                predict_value = np.argmax(all_outputs[1:].detach().numpy(), axis=-1)[:, None]  # predicted labels (0 or 1)
                actual_value = all_labels[1:].detach().numpy()[:, None]  # true labels
                result = np.concatenate((predict_value, actual_value), axis=1)  # pairs of [prediction, truth]
                look_and_save_data(BERT, result, this_time, save_path, writing_mode, Train, step=e,
                                   loss=np.array(all_loss).mean(), all_time=all_time)
                writing_mode = 'a'  # switch to append mode
    def test(BERT, data_path, batch_size, class_num, save_path, writing_mode, Train):
        # count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('There are {} samples in total'.format(datas_len))
        BERT.load_state_dict(torch.load(save_path + "/model.ckpt"))
        BERT.eval()
        this_time_start = time.time()  # start time
        batch_num = datas_len // batch_size  # number of full batches
        all_outputs = torch.tensor(np.zeros(shape=(1, class_num)), dtype=torch.float32)
        all_labels = torch.tensor(np.zeros(shape=(1)), dtype=torch.float32)
        # batch evaluation
        for batch in range(batch_num):
            # fetch a batch
            labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch)
            labels = torch.tensor(labels, dtype=torch.float32).long()  # 0.001/1.001 -> 0/1
            with torch.no_grad():  # no gradients needed at test time
                outputs, cls = BERT.calls(comments, max_len)  # forward pass
                cls = line(cls)
                cls = torch.softmax(cls, dim=-1)
            # accumulate the results
            all_outputs = torch.cat((all_outputs, cls), dim=0)
            all_labels = torch.cat((all_labels, labels.float()), dim=0)
            print('\rbatch {:3} of {} batches'.format(batch + 1, batch_num), end='')
        this_time = time.time() - this_time_start  # total time
        predict_value = np.argmax(all_outputs[1:].numpy(), axis=-1)[:, None]  # predicted labels (0 or 1)
        actual_value = all_labels[1:].numpy()[:, None]  # true labels
        result = np.concatenate((predict_value, actual_value), axis=1)  # pairs of [prediction, truth]
        look_and_save_data(BERT, result, this_time, save_path, writing_mode, Train)
    # print and save the training progress or test results
    def look_and_save_data(model, result, this_time, save_path, writing_mode, Train,
                           step=None, loss=None, all_time=None):
        # compute P, R, F1 and Accuracy
        TP = len([i for i in result if i.sum() == 2])
        TN = len([i for i in result if i.sum() == 0])
        FP = len([i for i in result if (i[0] - i[1]) == 1])
        FN = len([i for i in result if (i[0] - i[1]) == -1])
        P = (TP + 0.0001) / (TP + FP + 0.0001)
        R = (TP + 0.0001) / (TP + FN + 0.0001)
        F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
        Accuracy = (TP + TN) / len(result)
        os.makedirs(save_path, exist_ok=True)  # make sure the output directory exists
        # print and save the results
        if Train == True:  # training mode
            # print and save the training progress
            print("\tstep: {:3} | mean_loss: {:3f} | time: {:3f}m | Accuracy: {:3f} |".format(
                step, loss, this_time / 60, Accuracy))
            # save the training progress
            with open(save_path + '/train_process.txt', writing_mode, encoding='utf-8') as file:
                file.write(
                    "step: {:3} | mean_loss: {:3f} | time: {:3f}m | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
                        step, loss, all_time / 60, P, R, F1, Accuracy))
            # save the model
            torch.save(model.state_dict(), save_path + "/model.ckpt")
        else:  # test mode
            print("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f}m |\n".format(
                P, R, F1, Accuracy, this_time / 60))
            with open(save_path + '/test_result.txt', writing_mode, encoding='utf-8') as file:
                file.write("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f}m |\n".format(
                    P, R, F1, Accuracy, this_time / 60))
    # initialise the model, the classification head, the loss and the optimizer
    bert = bert()
    line = torch.nn.Linear(768, class_num)
    cross_entropy = torch.nn.CrossEntropyLoss()  # cross-entropy loss
    # include line.parameters(), otherwise the classification head would never be trained
    optimizer = torch.optim.Adam(list(bert.parameters()) + list(line.parameters()), lr=learning_rate)
    writing_mode = 'w'  # start in overwrite mode
    save_path = './model_data/cg'
    if Train == True:  # train the model
        train(bert, 'data_set/douban_comment/balanced/balanced_train.txt', steps, batch_size, class_num,
              optimizer, line, cross_entropy, save_path, writing_mode, Train)
    else:  # test the model
        test(bert, 'data_set/douban_comment/balanced/balanced_test.txt', batch_size, class_num,
             save_path, writing_mode, Train)
bert_textCNN.py
import os
import time
import numpy as np
import get_data
from all_param import *
import bert_torch
import torch
import math
#from transformers import BertTokenizer, BertModel
class TextCNN(torch.nn.Module):
    def __init__(self, embed_dim, kernel_num, cnn_layer, learning_rate, class_num, DEVICE):
        super(TextCNN, self).__init__()
        # first convolution layer: kernels of size (2, embed_dim), (3, embed_dim) and (4, embed_dim);
        # ModuleList so the optimizer sees these parameters (a plain Python list would hide them from self.parameters())
        self.conv = torch.nn.ModuleList([torch.nn.Conv2d(1, kernel_num, (i, embed_dim)).to(DEVICE) for i in range(2, 5)])
        self.relu = torch.nn.ReLU()
        self.max_pool = torch.nn.MaxPool1d(2, ceil_mode=True)  # max-pooling layer
        self.drop = torch.nn.Dropout(learning_rate)
        # the deeper convolution layers (channels double at every layer)
        if cnn_layer > 1:
            self.conv_add = torch.nn.ModuleList([torch.nn.Conv1d(int(math.pow(2, i)) * kernel_num,
                                                 2 * int(math.pow(2, i)) * kernel_num, 2).to(DEVICE) for i in range(cnn_layer - 1)])
        # derive the flattened size [batch_size, line_dim] that reaches the fully connected layer
        line_dim = int(max_len / 2)  # after the first pooling
        if cnn_layer > 1:
            for i in range(cnn_layer - 1):  # layers 2 .. cnn_layer
                if i % 2 == 0:  # even layers pool the whole sequence after the convolution
                    line_dim = int((line_dim - 1) / 2)
                if i % 2 == 1:  # odd layers leave one element over, which ceil_mode pools once more
                    line_dim = int((line_dim - 1) / 2) + 1
        line_dim = int(math.pow(2, cnn_layer - 1)) * kernel_num * line_dim  # times the number of kernels
        # the fully connected layer (3 branches concatenated)
        self.line = torch.nn.Linear(line_dim * 3, class_num)
    # one CNN branch
    def conv_and_pool(self, input, conv):
        """
        :param input: input data
        :param conv: the convolution layer
        :return: the flattened branch output
        """
        data = conv(input)  # convolution [batch, kernel_num, ~max_len, 1]
        data = data.squeeze(3)  # squeeze [batch, kernel_num, ~max_len]
        data = self.relu(data)  # ReLU activation
        data = self.max_pool(data)  # pooling [batch, kernel_num, ~max_len/2]
        if cnn_layer > 1:  # the deeper convolution layers
            for this_layer in range(len(self.conv_add)):  # e.g. shapes in the second layer:
                data = self.conv_add[this_layer](data)  # convolution [batch, kernel_num*2, ~max_len/2-1]
                data = self.relu(data)  # ReLU activation [batch, kernel_num*2, ~max_len/2-1]
                data = self.max_pool(data)  # pooling [batch, kernel_num*2, ~(max_len/2-1)/2]
        data = torch.reshape(data, shape=(data.shape[0], -1))  # flatten the trailing axes
        return data
    # run the three branches (kernel heights 2, 3 and 4)
    def calls(self, input):
        """
        :param input: input data [batch, 1, max_len, embed_dim]
        :return: class scores [batch, class_num]
        """
        datas = []
        # collect the three branch outputs
        for i in range(len(self.conv)):
            data = self.conv_and_pool(input, self.conv[i])
            datas.append(data)
        # concatenate the branch outputs
        for i in range(1, len(datas)):
            datas[0] = torch.cat((datas[0], datas[i]), dim=1)
        datas = self.drop(datas[0])  # dropout against overfitting
        output = self.line(datas)  # fully connected layer
        return output
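    # Quick shape check (comments only; assumes the all_param defaults
    # max_len=250, word2vec_size=768, kernel_num=32, cnn_layer=3, class_num=2):
    #   cnn = TextCNN(768, 32, 3, 0.1, 2, torch.device('cpu'))
    #   x = torch.randn(16, 1, 250, 768)   # [batch, channel, max_len, embed_dim]
    #   cnn.calls(x).shape                 # -> torch.Size([16, 2]); each branch flattens to 128*31 = 3968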
class mymodel(torch.nn.Module):
    def __init__(self, embed_dim, kernel_num, cnn_layer, learning_rate, class_num, Train, DEVICE):
        super(mymodel, self).__init__()
        self.bert = bert_torch.bert(class_num)
        self.cnn = TextCNN(embed_dim, kernel_num, cnn_layer, learning_rate, class_num, DEVICE)
        # reduction: 'none' keeps per-sample losses, 'mean' averages over the batch, 'sum' adds them up
        self.cross_entropy = torch.nn.CrossEntropyLoss()  # cross-entropy loss
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        #self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=10, gamma=0.1)  # learning-rate decay policy
        #self.drop = torch.nn.Dropout(learning_rate)
        self.writing_mode = 'w'
        self.Train = Train
    def Training(self, data_path, verify_path, max_len, DEVICE, epoch, batch_size, class_num, save_path):
        self.train()
        # count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('There are {} samples in total'.format(datas_len))
        # training
        all_time_start = time.time()
        best_acc = 0
        for e in range(epoch):
            this_time_start = time.time()  # start time
            batch_num = datas_len // batch_size  # number of full batches
            batch_num = 10  # debug override: only run 10 batches per epoch
            all_loss = []
            all_outputs = torch.tensor(np.zeros(shape=(1, class_num)), dtype=torch.float32)
            all_labels = torch.tensor(np.zeros(shape=1), dtype=torch.float32)
            # batch training
            for batch in range(batch_num):
                # fetch a batch
                labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch)
                long_labels = torch.tensor(labels, dtype=torch.float32).long()  # 0.001/1.001 -> 0/1
                self.optimizer.zero_grad()  # 1. zero the gradients
                outputs, _ = self.bert.calls(comments, max_len, DEVICE)  # 2. forward pass through BERT
                outputs = outputs.unsqueeze(1)  # [batch, 1, max_len, 768] for the Conv2d branches
                outputs = self.cnn.calls(outputs)
                #outputs = torch.softmax(outputs, dim=-1)
                loss = self.cross_entropy(outputs.to('cpu'), long_labels)  # 3. compute the loss
                loss.backward()  # 4. backpropagation
                self.optimizer.step()  # 5. update the parameters (w, b)
                # accumulate the epoch results
                all_loss.append(loss.item())  # item() extracts the scalar loss
                all_outputs = torch.cat((all_outputs, outputs.to('cpu')), dim=0)
                for i in range(len(labels)):
                    if labels[i] == 0.001:
                        labels[i] = 0
                    else:
                        labels[i] = 1
                labels = torch.tensor(labels, dtype=torch.float32)
                all_labels = torch.cat((all_labels, labels), dim=0)
                ## keep the checkpoint with the best validation accuracy
                #Acc = self.test(verify_path, batch_size, class_num, save_path)
                #if Acc > best_acc:
                #    best_acc = Acc
                #    # save the model
                #    torch.save(self.state_dict(), save_path + "/model.pth")
                print('\rtraining progress {:2d}%, {} batches in total, {:2d}% done, current loss: {:4f}, ACC: {} '.format(
                    int((e) / epoch * 100), batch_num, int((batch + 1) / batch_num * 100), loss, 'None'), end='')
            # print and save this epoch's results
            if e % 1 == 0:
                torch.save(self, save_path + "/model.pth")
                this_time = time.time() - this_time_start  # time for this epoch
                all_time = time.time() - all_time_start  # total time so far
                predict_value = np.argmax(all_outputs[1:].detach().numpy(), axis=-1)[:, None]  # predicted labels (0 or 1)
                actual_value = all_labels[1:].detach().numpy()[:, None]  # true labels
                result = np.concatenate((predict_value, actual_value), axis=1)  # pairs of [prediction, truth]
                mean_loss = np.array(all_loss).mean()
                acc = self.look_and_save_data(result, this_time, save_path, self.writing_mode, self.Train, step=e,
                                              loss=mean_loss, all_time=all_time)
                self.writing_mode = 'a'  # switch to append mode
    def test(self, data_path, batch_size, class_num, save_path, test_data_save=False):
        self.eval()
        # count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('There are {} samples in total'.format(datas_len))
        this_time_start = time.time()  # start time
        batch_num = datas_len // batch_size  # number of full batches
        all_outputs = torch.tensor(np.zeros(shape=(1, class_num)), dtype=torch.float32)
        all_labels = torch.tensor(np.zeros(shape=1), dtype=torch.float32)
        batch_num = 30  # debug override: only evaluate 30 batches
        # batch evaluation
        for batch in range(batch_num):
            # fetch a batch
            labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch)
            with torch.no_grad():  # no gradients needed, saves memory
                outputs, _ = self.bert.calls(comments, max_len, DEVICE)  # forward pass through BERT
                outputs = self.cnn.calls(outputs.unsqueeze(1))
            # accumulate the results
            all_outputs = torch.cat((all_outputs, outputs.to('cpu')), dim=0)
            for i in range(len(labels)):
                if labels[i] == 0.001:
                    labels[i] = 0
                else:
                    labels[i] = 1
            labels = torch.tensor(labels, dtype=torch.float32)
            all_labels = torch.cat((all_labels, labels), dim=0)
            if test_data_save != False:
                print('\r{} batches in total, testing progress {:2d}% '.format(batch_num, int((batch + 1) / batch_num * 100)), end='')
        this_time = time.time() - this_time_start  # total time
        all_outputs = np.argmax(all_outputs[1:].detach().numpy(), axis=-1)[:, None]  # predicted labels (0 or 1)
        all_labels = all_labels[1:].detach().numpy()[:, None]  # true labels
        all_outputs = np.concatenate((all_outputs, all_labels), axis=1)  # pairs of [prediction, truth]
        # compute the metrics and save the results
        Acc = self.look_and_save_data(all_outputs, this_time, save_path, self.writing_mode, test_data_save=test_data_save)
        return Acc
    # print and save the training progress or test results
    def look_and_save_data(self, result, this_time, save_path, writing_mode, Train=False, step=None, loss=None,
                           all_time=None, test_data_save=False):
        # compute P, R, F1 and Accuracy
        TP = len([i for i in result if i.sum() == 2])
        TN = len([i for i in result if i.sum() == 0])
        FP = len([i for i in result if (i[0] - i[1]) == 1])
        FN = len([i for i in result if (i[0] - i[1]) == -1])
        P = (TP + 0.0001) / (TP + FP + 0.0001)
        R = (TP + 0.0001) / (TP + FN + 0.0001)
        F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
        Accuracy = (TP + TN) / len(result)
        # print and save the results
        if Train == True:  # training mode
            # print and save the training progress
            print("\tstep: {:3} | mean_loss: {:3f} | time: {:3f}m | train_data_Acc: {:3f} |".format(
                step, loss, this_time / 60, Accuracy))
            # save the training progress
            with open(save_path + '/train_process.txt', writing_mode, encoding='utf-8') as file:
                file.write(
                    "step: {:3} | mean_loss: {:3f} | time: {:3f}m | P: {:3f} | R: {:3f} | F1: {:3f} | train_data_Acc: {:3f} |\n".format(
                        step, loss, all_time / 60, P, R, F1, Accuracy))
            ## saving only the weights instead would be:
            # torch.save(model.state_dict(), save_path+"/model.pth")
        else:  # test mode
            if test_data_save == True:
                print("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f}m |\n".format(
                    P, R, F1, Accuracy, this_time / 60))
                with open(save_path + '/test_result.txt', writing_mode, encoding='utf-8') as file:
                    file.write("P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f}m |\n".format(
                        P, R, F1, Accuracy, this_time / 60))
        return Accuracy
if __name__ == '__main__':
    #tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')  # tokenizer for the base model
    #model = BertModel.from_pretrained('bert-base-chinese')
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('GPU: ', 'available' if str(DEVICE) == "cuda" else "not available")  # is a GPU available?
    print('torch version: ', torch.__version__)
    if str(DEVICE) == "cuda":  # these queries fail without CUDA, so guard them
        print('GPU count: ', torch.cuda.device_count())
        print('GPU index: ', torch.cuda.current_device())
        print('GPU name: ', torch.cuda.get_device_name(0))
    # paths
    save_path = 'model_data/balanced_bert_output_CNN_in_50_3_label'
    os.makedirs(save_path, exist_ok=True)  # create the output directory
    train_path = 'data_set/douban_comment/balanced/balanced_train.txt'
    test_path = 'data_set/douban_comment/balanced/balanced_test.txt'
    verify_path = 'data_set/douban_comment/balanced/balanced_verify.txt'
    if Train == True:
        model = mymodel(word2vec_size, kernel_num, cnn_layer, learning_rate, class_num, Train, DEVICE).to(DEVICE)
        model.Training(train_path, verify_path, max_len, DEVICE, steps, batch_size, class_num, save_path)
        # evaluate right after training
        Train = False
        model.test(test_path, batch_size, class_num, save_path, test_data_save=True)
    else:
        model = torch.load(save_path + "/model.pth")  # load the saved model
        model.test(test_path, batch_size, class_num, save_path, test_data_save=True)