NLP学习笔记——情感分析实战(情感分类)

相关知识请自行了解,这里直接上代码。代码是很久以前在大学时写的,可能有点乱,工作之后一直没时间优化,但各模块的功能没有问题;如果报错,多半是库版本不兼容导致的(原来用的库版本已经记不清了),Python 用的是 3.8。

all_param.py

word2vec_size = 768         # Word-vector (embedding) dimension

max_len = 250               # Maximum sentence length (in tokens)

batch_size = 16             # Number of samples per training batch

head_num = 8                # Number of attention heads; must be smaller than the word-vector dimension (head_dim = word2vec_size // head_num)

transformer_layer = 1               # Number of encoder (decoder) layers

class_num = 2               # Number of target classes

learning_rate = 1e-5       # Learning rate

steps = 10                   # Number of training passes over the data set

Train = True           # Mode switch: True = training mode, False = prediction mode

cnn_layer = 3         # Number of CNN layers

kernel_num = 32         # Number of convolution kernels (filters)

一、Transformer-textCNN

data2vector.py   这部分比较乱:不同的数据集需要不同的处理方法,建议大家根据自己的数据自行编写。

import numpy as np
from all_param import *

def word2vec_index(file_path):
    """
    Load a text-format word-vector file and build the lookup tables.

    The first line of the file is treated as a header and skipped. Every
    following line is expected to hold a token followed by its vector
    components, separated by single spaces. Blank lines and tokens without
    any component are ignored, and empty components (e.g. caused by a
    trailing separator) are dropped before parsing.

    :param file_path: path of the word-vector file
    :return word2vector: dict mapping token -> vector (numpy float array)
    :return word2index: dict mapping token -> integer index (real tokens start at 1)
    :return index2word: dict mapping integer index -> token
    :raises ValueError: if a vector component cannot be parsed as a float
    """
    word2vector = {}
    word2index = {}
    index2word = {}
    vector_dim = None
    with open(file_path, 'r', encoding='utf-8') as file:
        next(file, None)    # skip the header line without loading the whole file
        index = 1
        for line in file:
            # Split on single spaces only, so tokens made of other whitespace
            # characters (e.g. a full-width space) are kept intact.
            word, *components = line.rstrip('\r\n').split(' ')
            components = [value for value in components if value]
            if not components:
                continue    # blank line or token without a vector
            vector = np.array(components, dtype=float)
            if vector_dim is None:
                vector_dim = vector.shape[0]
            # Build the lookup tables
            word2vector[word] = vector
            word2index[word] = index
            index2word[index] = word
            index += 1

    # Padding entry (index 0). Its dimension follows the loaded vectors so the
    # table stays consistent; the configured size is only used for an empty file.
    pad_dim = word2vec_size if vector_dim is None else vector_dim
    word2vector['<pad>'] = np.zeros(shape=(pad_dim))
    word2index['<pad>'] = 0
    index2word[0] = '<pad>'

    return word2vector, word2index, index2word


def data_processing(path, data_len, word2vector, word2index, data_batch, data_start_site):
    """
    Read one batch of labelled comments and convert it to numeric arrays.

    Each line of the file starts with a one-character label; the comment
    text begins at the third character and consists of space-separated
    tokens. When the requested window runs past ``data_len`` the batch wraps
    around and is completed from the start of the file.

    :param path: path of the data set
    :param data_len: number of samples in the data set
    :param word2vector: dict mapping token -> vector
    :param word2index: dict mapping token -> index
    :param data_batch: number of samples to take
    :param data_start_site: position of the first sample to take
    :return comment2vector: array [batch, max_len, word2vec_size] of word vectors
    :return comment2index: array [batch, max_len] of token indices (0 = padding)
    :return labels: array [batch, class_num] of smoothed one-hot labels
    """
    with open(path, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()

    stop = data_start_site + data_batch
    if stop > data_len:
        # Window passes the end of the data: take the tail, then wrap around.
        batch = lines[data_start_site:] + lines[:stop - data_len]
    else:
        batch = lines[data_start_site:stop]

    # Zero-initialised buffers; positions that are never written act as padding.
    sample_count = len(batch)
    comment2vector = np.zeros(shape=(sample_count, max_len, word2vec_size))
    comment2index = np.zeros(shape=(sample_count, max_len))
    labels = np.zeros(shape=(sample_count, class_num), dtype=float)

    for row, raw in enumerate(batch):
        # Comment text follows the label and its separator; drop empty tokens.
        tokens = [tok for tok in raw[2:].replace('\n', '').split(' ') if tok != '']

        # Convert at most max_len tokens; shorter comments keep their zero padding.
        for col, tok in enumerate(tokens[:max_len]):
            comment2vector[row][col] = word2vector[tok]
            comment2index[row][col] = word2index[tok]

        # One-hot encode the label taken from the first character of the line.
        label = int(raw[:1])
        labels[row][label] = 1

        # Label smoothing: zeros become 0.0000001, everything else 0.9999999.
        labels[row] = np.where(labels[row] == 0, 0.0000001, 0.9999999)

    return comment2vector, comment2index, labels





if __name__ == '__main__':
    vector_file = 'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector'
    train_file = 'data_set/douban_comment/balanced/balanced_train.txt'

    # Load the word vectors and the lookup tables.
    word2vector, word2index, index2word = word2vec_index(vector_file)

    # Count the samples in the data set.
    with open(train_file, 'r', encoding='utf-8') as file1:
        datas_len = sum(1 for _ in file1)
    print('一共有{}条数据'.format(datas_len))

    # Feed the data set batch by batch.
    # (The number of full batches would be datas_len // batch_size; this
    # demo only looks at the first batches.)
    batch_num = 1
    for batch_no in range(batch_num + 1):
        start_site = batch_no * batch_size
        comment_vector, comment_index, labels = data_processing(
            train_file, datas_len, word2vector, word2index, batch_size, start_site)
        print(labels)

block_transformer.py

"""
定义transformer模块
"""
from all_param import *
from tensorflow import keras
import numpy as np
import tensorflow as tf

class transformer(keras.Model):
    """
    Transformer encoder used as a feature extractor.

    ``calls`` adds a sinusoidal positional encoding to the input word vectors
    and runs them through ``layer_num`` passes of a pre-norm encoder layer
    (LayerNorm -> multi-head self-attention -> residual add, then
    LayerNorm -> feed-forward -> residual add). The same layer weights are
    reused for every pass.
    """

    def __init__(self, max_len, word_dim, head_num, class_num, learning_rate, Train,
                 dropout_rate=None):
        """
        :param max_len: sequence length the positional encoding is built for
        :param word_dim: word-vector dimension
        :param head_num: number of attention heads
        :param class_num: number of classes (output size of ``self.linear``)
        :param learning_rate: used as the dropout rate when ``dropout_rate``
            is not given (kept for backward compatibility)
        :param Train: whether dropout is active (training mode)
        :param dropout_rate: optional explicit dropout rate
        """
        super(transformer, self).__init__()
        self.Train = Train
        self.head_num = head_num                          # keep the constructor value, not the global
        self.pe = self.positional_encoding(word_dim, max_len)   # positional encoding
        self.head_dim = word_dim // head_num              # per-head dimension
        # Query / key / value projections
        self.Wq = keras.layers.Dense(self.head_dim * head_num, kernel_initializer='RandomUniform')
        self.Wk = keras.layers.Dense(self.head_dim * head_num, kernel_initializer='RandomUniform')
        self.Wv = keras.layers.Dense(self.head_dim * head_num, kernel_initializer='RandomUniform')
        # Feed-forward network (hidden layer)
        self.feed_forward_network = keras.layers.Dense(word_dim * head_num, kernel_initializer='RandomUniform',
                                                       activation=keras.activations.relu)
        # Projections back to word_dim: [0] after attention, [1] after the feed-forward layer
        self.adjust_shape = [keras.layers.Dense(word_dim, kernel_initializer='RandomUniform') for _ in range(2)]
        # NOTE(review): historically the dropout rate was the learning rate;
        # pass dropout_rate explicitly to decouple the two.
        rate = learning_rate if dropout_rate is None else dropout_rate
        self.drop = [keras.layers.Dropout(rate=rate) for _ in range(2)]
        self.layer_norm = [keras.layers.LayerNormalization(axis=-1) for _ in range(2)]
        self.linear = keras.layers.Dense(class_num, kernel_initializer='RandomUniform')   # classification layer

    # Positional encoding
    def positional_encoding(self, word_dim, max_len):
        """
        Build the sinusoidal positional encoding.

        :param word_dim: word-vector dimension
        :param max_len: number of positions
        :return pe: array [max_len, word_dim]; sine on even columns, cosine on odd columns
        """
        pos = np.arange(max_len)[:, np.newaxis]
        # Columns 2i and 2i+1 share the exponent 2i / word_dim.
        exponent = (np.arange(word_dim) // 2 * 2)[np.newaxis, :]

        pe = pos / np.power(10000, exponent / word_dim)
        pe[:, 0::2] = np.sin(pe[:, 0::2])
        pe[:, 1::2] = np.cos(pe[:, 1::2])

        return pe

    def _split_heads(self, x):
        """Turn [batch, seq, head_num * head_dim] into [batch, head_num, seq, head_dim]."""
        # Split the feature axis first, then move the head axis forward. A
        # direct reshape to [batch, head_num, seq, head_dim] would mix values
        # from different token positions.
        x = tf.reshape(x, (-1, x.shape[1], self.head_num, self.head_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    # Multi-head attention
    def multi_head_attention(self, x_embedding, x_index, this_layer):
        """
        :param x_embedding: word vectors [batch, seq, word_dim]
        :param x_index: token indices [batch, seq]; 0 marks padding
        :param this_layer: index of the current encoder pass (unused)
        :return output: word vectors carrying attention information [batch, seq, word_dim]
        """
        q, k, v = self.Wq(x_embedding), self.Wk(x_embedding), self.Wv(x_embedding)
        h_q = self._split_heads(q)
        h_k = self._split_heads(k)
        h_v = self._split_heads(v)

        # Scaled dot-product scores, [batch, head_num, seq, seq]
        attention = tf.matmul(h_q, h_k, transpose_b=True) / np.sqrt(self.head_dim)
        attention_mask = self.mask(x_index)
        # Push padded key positions towards -inf so softmax gives them ~0 weight
        attention += attention_mask * -1e10
        self.attention = tf.nn.softmax(attention, axis=-1)
        att_massage = tf.matmul(self.attention, h_v)

        # Merge the heads back: [batch, seq, head_num * head_dim]
        att_massage = tf.transpose(att_massage, perm=[0, 2, 1, 3])
        att_massage = tf.reshape(att_massage, (-1, att_massage.shape[1], self.head_num * self.head_dim))
        # head_num * head_dim is not necessarily word_dim, so project back
        output = self.adjust_shape[0](att_massage)
        output = self.drop[0](output, training=self.Train)

        return output

    # Padding mask for the attention scores
    def mask(self, x_index):
        """
        :param x_index: token indices [batch, seq]
        :return attention_mask: float mask [batch, 1, 1, seq], 1.0 where the index is 0
        """
        is_padding = tf.math.equal(x_index, tf.zeros_like(x_index))
        attention_mask = is_padding[:, np.newaxis, np.newaxis, :]
        attention_mask = tf.cast(attention_mask, dtype=tf.float32)

        return attention_mask

    # Feed-forward network
    def feed_forward(self, attention, this_layer):
        """
        :param attention: word vectors carrying attention information
        :param this_layer: index of the current encoder pass (unused)
        :return output: transformed word vectors projected back to word_dim
        """
        output = self.feed_forward_network(attention)
        output = self.adjust_shape[1](output)
        output = self.drop[1](output, training=self.Train)

        return output

    # One encoder layer
    def encoder_layer(self, x_embedding, x_index, this_layer):
        """
        :param x_embedding: word vectors including the positional encoding
        :param x_index: token indices
        :param this_layer: index of the current encoder pass
        :return: output of the encoder layer, same shape as x_embedding
        """
        x_attention = self.layer_norm[0](x_embedding)                              # Norm
        x_attention = self.multi_head_attention(x_attention, x_index, this_layer)  # attention
        x_attention += x_embedding                                                 # Add

        x_message = self.layer_norm[1](x_attention)                                # Norm
        x_message = self.feed_forward(x_message, this_layer)                       # feed-forward
        x_message += x_attention                                                   # Add

        return x_message

    # Full encoder
    def encoder(self, x_embedding, x_index, layer_num):
        """
        :param x_embedding: word vectors including the positional encoding
        :param x_index: token indices
        :param layer_num: number of encoder passes
        :return x_message: features extracted by the encoder
        """
        x_message = x_embedding
        for i in range(layer_num):
            x_message = self.encoder_layer(x_message, x_index, i)

        return x_message

    # Whole model
    def calls(self, x_vector, x_index, layer_num):
        """
        :param x_vector: word vectors [batch, max_len, word_dim]
        :param x_index: token indices [batch, max_len]
        :param layer_num: number of encoder passes
        :return: encoder features [batch, max_len, word_dim]; the pooling,
            linear and softmax steps are disabled below, so no class
            probabilities are produced here
        """
        x_embedding = x_vector + self.pe                        # add the positional encoding
        scores = self.encoder(x_embedding, x_index, layer_num)  # encoder
        # Disabled classification head:
        # scores = tf.reduce_mean(scores, axis=1)      # pool over max_len to get a sentence vector
        # scores = self.linear(scores)                 # [batch_size, class_num]
        # scores = tf.math.softmax(scores, axis=-1)    # class probabilities

        return scores


if __name__ == '__main__':
    """
    测试transformer能不能正常使用
    """
    import os
    import data2vector
    import pickle
    import time

    # 训练
    def train(model, data_path, batch_size, steps, word2vector, word2index, class_num, layer_num,
              cross_entropy,optimizer,save_path,writing_mode):
        """
        Train the transformer on its own as a classifier and log every pass.

        ``model.calls`` returns encoder features rather than class
        probabilities, so this harness mean-pools them over the sequence axis
        and applies ``model.linear`` followed by a softmax to obtain scores
        that can be compared with the labels.

        ``index2word`` is not a parameter; it is read from the enclosing
        script scope when the results are saved.

        :param model: transformer model to train
        :param data_path: path of the training set
        :param batch_size: number of samples per batch
        :param steps: number of passes over the training set
        :param word2vector: dict mapping token -> vector
        :param word2index: dict mapping token -> index
        :param class_num: number of classes
        :param layer_num: number of encoder passes
        :param cross_entropy: loss callable taking (labels, scores)
        :param optimizer: optimizer used to apply the gradients
        :param save_path: directory for the log, the checkpoint and the vocabulary
        :param writing_mode: file mode used for the first log write
        """
        # Count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('共有{}条数据'.format(datas_len))

        data_copies = datas_len // batch_size  # number of full batches per pass
        #data_copies = 10

        all_time = 0  # total training time so far
        for step in range(steps):
            start_time = time.time()

            # Per-pass records. Sized to exactly the rows that get filled:
            # extra all-zero rows would be counted as correct class-0 predictions.
            all_loss = []
            all_scores = np.zeros(shape=(data_copies * batch_size, class_num))
            all_labels = np.zeros(shape=(data_copies * batch_size, class_num))

            for i in range(data_copies):
                x_vector, x_index, labels = data2vector.data_processing(
                    data_path, datas_len, word2vector, word2index,
                    batch_size, i * batch_size)
                with tf.GradientTape() as tape:
                    features = model.calls(x_vector, x_index, layer_num)  # [batch, max_len, word_dim]
                    pooled = tf.reduce_mean(features, axis=1)             # sentence vector
                    scores = tf.math.softmax(model.linear(pooled), axis=-1)  # [batch, class_num]
                    loss = cross_entropy(labels, scores)
                derivative = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(derivative, model.trainable_variables))

                # Record the results of this batch
                rows = slice(i * batch_size, (i + 1) * batch_size)
                all_loss.append(loss)
                all_scores[rows, :] = scores
                all_labels[rows, :] = labels
                print('\r共有{}批数据,第 {:3} 批数据,当前损失: {:4f} '.format(data_copies, i + 1, loss), end='')

            # Report and save the results of this pass
            this_time = time.time() - start_time
            all_time += this_time

            predict_value = np.argmax(all_scores, axis=-1)[:, None]  # predicted label
            actual_value = np.argmax(all_labels, axis=-1)[:, None]   # actual label
            result = np.concatenate((predict_value, actual_value), axis=1)  # [predicted, actual]
            look_and_save_data(model,result, this_time, save_path,writing_mode,word2vector=word2vector,
                                    word2index=word2index, index2word=index2word, step=step,
                                    loss=np.array(all_loss).mean(), all_time=all_time)
            writing_mode = 'a'  # later passes append to the log

    # 测试
    def test(model, data_path, batch_size,layer_num,class_num,save_path,writing_mode):
        """
        Evaluate a trained transformer on a test set and save the metrics.

        ``model.calls`` returns encoder features, so they are mean-pooled over
        the sequence axis and passed through ``model.linear`` to obtain one
        score per class before the predicted label is taken.

        :param model: transformer model to evaluate
        :param data_path: path of the test set
        :param batch_size: number of samples per batch
        :param layer_num: number of encoder passes
        :param class_num: number of classes (kept for interface compatibility)
        :param save_path: directory holding the checkpoint and the vocabulary
        :param writing_mode: file mode used for the result file
        """
        # Load the vocabulary and the trained weights.
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were written by this project's own training run.
        with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "rb") as f:
            dic = pickle.load(f)
        word2vector = dic['word2vector']
        word2index = dic['word2idx']
        model.load_weights(save_path+"/model.ckpt")

        # Count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('共有{}条数据'.format(datas_len))

        start_time = time.time()
        batch_num = datas_len // batch_size  # number of full batches
        # Each row holds [predicted, actual], so the width is 2 regardless of class_num
        results = np.zeros(shape=(batch_num * batch_size, 2))

        for i in range(batch_num):
            x_vector, x_index, labels = data2vector.data_processing(
                data_path, datas_len, word2vector, word2index,
                batch_size, i * batch_size)
            features = model.calls(x_vector, x_index, layer_num)  # [batch, max_len, word_dim]
            scores = model.linear(tf.reduce_mean(features, axis=1))  # [batch, class_num]
            predict_value = np.argmax(scores, axis=-1)[:, None]  # predicted label
            actual_value = np.argmax(labels, axis=-1)[:, None]   # actual label
            result = np.concatenate((predict_value, actual_value), axis=1)
            results[i * batch_size: (i + 1) * batch_size, :] = result
            print('\r第 {:3} 批数据,共有{}批数据'.format(i + 1, batch_num), end='')

        times = time.time() - start_time
        look_and_save_data(model,results, times,save_path,writing_mode)


    # 打印和保存训练过程或预测结果
    def look_and_save_data(model, result, this_time, save_path,writing_mode,word2vector=None, word2index=None, index2word=None,
                           step=None, loss=None, all_time=None):
        """
        Compute precision, recall, F1 and accuracy, then print and save them.

        In training mode (module-level ``Train`` is True) the training log is
        written and the model weights and vocabulary are saved; otherwise the
        test result file is written.

        :param model: model whose weights are saved in training mode
        :param result: rows of [predicted, actual] labels
        :param this_time: duration of this pass / evaluation
        :param save_path: output directory
        :param writing_mode: file mode for the log / result file
        :param word2vector: token -> vector dict saved with the model
        :param word2index: token -> index dict saved with the model
        :param index2word: index -> token dict saved with the model
        :param step: index of the training pass
        :param loss: mean loss of the pass
        :param all_time: total training time so far
        """
        # Confusion-matrix counts; label 1 is treated as the positive class.
        true_pos = sum(1 for row in result if row.sum() == 2)
        true_neg = sum(1 for row in result if row.sum() == 0)
        false_pos = sum(1 for row in result if (row[0] - row[1]) == 1)
        false_neg = sum(1 for row in result if (row[0] - row[1]) == -1)
        # The small constants keep the ratios defined when a count is zero.
        precision = (true_pos + 0.0001) / (true_pos + false_pos + 0.0001)
        recall = (true_pos + 0.0001) / (true_pos + false_neg + 0.0001)
        f1_score = (2 * precision * recall + 0.00001) / (precision + recall + 0.00001)
        accuracy = (true_pos + true_neg) / len(result)

        os.makedirs(save_path, exist_ok=True)  # make sure the output directory exists

        if Train != True:  # prediction mode: report the metrics and stop
            summary = "P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
                precision, recall, f1_score, accuracy, this_time)
            print(summary)
            with open(save_path+'/test_result.txt', writing_mode, encoding='utf-8') as file:
                file.write(summary)
            return

        # Training mode: print the progress of this pass
        print("\tstep: {:3}  |  mean_loss: {:3f}  |  time: {:3f}m  |  Accuracy: {:3f}  |".format(
            step, loss, this_time / 60, accuracy))
        # Append the metrics of this pass to the training log
        log_line = "step: {:3} | mean_loss: {:3f} | time: {:3f} | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
            step, loss, all_time, precision, recall, f1_score, accuracy)
        with open(save_path+'/train_process.txt', writing_mode, encoding='utf-8') as file:
            file.write(log_line)

        # Save the weights and the vocabulary needed to reload the model
        model.save_weights(save_path+"/model.ckpt")
        os.makedirs(save_path+"/tmp", exist_ok=True)
        vocabulary = {"word2vector": word2vector, "word2idx": word2index, "idx2word": index2word}
        with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "wb") as f:
            pickle.dump(vocabulary, f)


    # Loss function, optimizer and output settings
    cross_entropy = keras.losses.CategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam()
    writing_mode = 'w'  # the first write overwrites an existing file
    save_path = './model_data/cg'
    # Build the model
    model = transformer(max_len, word2vec_size, head_num, class_num, learning_rate, Train)

    if Train == True:  # training mode
        vector_path = 'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector'
        train_path = 'data_set/douban_comment/balanced/balanced_train.txt'
        # These names stay at script level: train() reads index2word from here.
        word2vector, word2index, index2word = data2vector.word2vec_index(vector_path)
        train(model, train_path, batch_size, steps, word2vector, word2index,
              class_num, transformer_layer, cross_entropy, optimizer, save_path, writing_mode)

    else:  # prediction mode
        test_path = 'data_set/douban_comment/balanced/balanced_test.txt'
        test(model, test_path, batch_size, transformer_layer,
             class_num, save_path, writing_mode)

block_CNN.py

import tensorflow as tf
from all_param import *
import numpy as np

class TextCNN(tf.keras.Model):
    """
    TextCNN feature extractor with three parallel branches.

    Each branch starts with a Conv2D whose kernel covers 2, 3 or 4 tokens and
    the full word-vector width, followed by max pooling and ``cnn_layer - 1``
    further Conv1D + max-pooling stages (the Conv1D stages are shared by the
    three branches). The flattened branch outputs are concatenated and
    projected to ``word2vec_size`` features.
    """

    def __init__(self, word2vec_size, kernel_num, cnn_layer, learning_rate, class_num, Train,
                 dropout_rate=None):
        """
        :param word2vec_size: word-vector dimension
        :param kernel_num: number of filters of the first convolution
        :param cnn_layer: total number of convolution stages per branch
        :param learning_rate: used as the dropout rate when ``dropout_rate``
            is not given (kept for backward compatibility)
        :param class_num: number of classes (output size of ``self.line1``)
        :param Train: whether dropout is active (training mode)
        :param dropout_rate: optional explicit dropout rate
        """
        super(TextCNN, self).__init__()
        self.Train = Train
        self.cnn_layer = cnn_layer   # keep the constructor value, not the global
        # First stage: kernels of size (2, dim), (3, dim) and (4, dim)
        self.conv = [tf.keras.layers.Conv2D(kernel_num, (i, word2vec_size), strides=(1, 1), padding='valid',
                                            kernel_initializer='RandomUniform', activation='relu')
                     for i in range(2, 5)]
        self.max_pool = tf.keras.layers.MaxPool1D(pool_size=2, padding='same')
        # NOTE(review): historically the dropout rate was the learning rate;
        # pass dropout_rate explicitly to decouple the two.
        rate = learning_rate if dropout_rate is None else dropout_rate
        self.drop = tf.keras.layers.Dropout(rate=rate)
        self.line0 = tf.keras.layers.Dense(word2vec_size, kernel_initializer='RandomUniform')  # feature projection
        self.line1 = tf.keras.layers.Dense(class_num, kernel_initializer='RandomUniform')      # class scores

        # Deeper stages. The filter count doubles per stage and is a plain
        # Python int, which is what Conv1D expects for `filters`. The list is
        # always defined (empty when cnn_layer <= 1).
        self.conv_add = [tf.keras.layers.Conv1D(2 ** (i + 1) * kernel_num, 2, strides=1,
                                                kernel_initializer='RandomUniform', activation='relu',
                                                padding='valid')
                         for i in range(cnn_layer - 1)]

    # One CNN branch
    def conv_and_pool(self, input, conv):
        """
        :param input: input data [batch, max_len, word2vec_size, 1]
        :param conv: first-stage convolution layer of this branch
        :return: flattened branch features [batch, features]
        """
        data = conv(input)            # [batch, max_len - k + 1, 1, kernel_num] for kernel height k
        data = tf.reshape(data, (data.shape[0], data.shape[1], -1))  # drop the width-1 axis
        data = self.max_pool(data)    # halve the sequence axis
        for deeper_conv in self.conv_add:   # deeper stages, if any
            data = deeper_conv(data)
            data = self.max_pool(data)

        data = tf.reshape(data, (data.shape[0], -1))   # flatten to [batch, features]

        return data

    # Combine the three branches
    def calls(self, input):
        """
        :param input: input data [batch, max_len, word2vec_size, 1]
        :return: features [batch, word2vec_size]; the class layer ``line1``
            and the softmax are disabled below
        """
        branches = [self.conv_and_pool(input, conv) for conv in self.conv]
        merged = tf.concat(branches, 1)

        output = self.drop(merged, training=self.Train)    # dropout against overfitting
        output = self.line0(output)
        # Disabled classification head:
        # output = self.line1(output)                  # class scores
        # output = tf.math.softmax(output, axis=-1)    # class probabilities

        return output



if __name__=='__main__':
    """
    测试CNN能不能正常使用
    """
    import os
    import data2vector
    import pickle
    import time

    # 训练
    def train(model, data_path, batch_size, steps, word2vector, word2index, index2word, class_num,
              cross_entropy,optimizer,save_path,writing_mode):
        """
        Train the TextCNN on its own as a classifier and log every pass.

        ``model.calls`` returns ``word2vec_size`` features rather than class
        probabilities, so this harness applies ``model.line1`` followed by a
        softmax to obtain scores that can be compared with the labels.

        :param model: TextCNN model to train
        :param data_path: path of the training set
        :param batch_size: number of samples per batch
        :param steps: number of passes over the training set
        :param word2vector: dict mapping token -> vector
        :param word2index: dict mapping token -> index
        :param index2word: dict mapping index -> token
        :param class_num: number of classes
        :param cross_entropy: loss callable taking (labels, scores)
        :param optimizer: optimizer used to apply the gradients
        :param save_path: directory for the log, the checkpoint and the vocabulary
        :param writing_mode: file mode used for the first log write
        """
        # Count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('共有{}条数据'.format(datas_len))

        data_copies = datas_len // batch_size  # number of full batches per pass
        #data_copies = 80

        all_time = 0  # total training time so far
        for step in range(steps):
            start_time = time.time()

            # Per-pass records
            all_loss = []
            all_scores = np.zeros(shape=(data_copies * batch_size, class_num))
            all_labels = np.zeros(shape=(data_copies * batch_size, class_num))

            for i in range(data_copies):
                x_vector, x_index, labels = data2vector.data_processing(
                    data_path, datas_len, word2vector, word2index,
                    batch_size, i * batch_size)
                with tf.GradientTape() as tape:
                    x_vector = x_vector[:, :, :, np.newaxis]   # add the channel axis for Conv2D
                    features = model.calls(x_vector)           # [batch, word2vec_size]
                    scores = tf.math.softmax(model.line1(features), axis=-1)  # [batch, class_num]
                    loss = cross_entropy(labels, scores)
                derivative = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(derivative, model.trainable_variables))

                # Record the results of this batch
                rows = slice(i * batch_size, (i + 1) * batch_size)
                all_loss.append(loss)
                all_scores[rows, :] = scores
                all_labels[rows, :] = labels
                print('\r共有{}批数据,第 {:3} 批数据,当前损失: {:4f} '.format(data_copies, i + 1, loss), end='')

            # Report and save the results of this pass
            this_time = time.time() - start_time
            all_time += this_time

            predict_value = np.argmax(all_scores, axis=-1)[:, None]  # predicted label
            actual_value = np.argmax(all_labels, axis=-1)[:, None]   # actual label
            result = np.concatenate((predict_value, actual_value), axis=1)  # [predicted, actual]
            mean_loss = np.array(all_loss).mean()

            look_and_save_data(model,result, this_time, save_path,writing_mode,word2vector=word2vector,
                                    word2index=word2index, index2word=index2word, step=step,
                                    loss=mean_loss, all_time=all_time)
            writing_mode = 'a'  # later passes append to the log

    # 测试
    def test(model, data_path, batch_size,class_num,save_path,writing_mode):
        """
        Evaluate a trained TextCNN on a test set and save the metrics.

        ``model.calls`` returns ``word2vec_size`` features, so they are passed
        through ``model.line1`` to obtain one score per class before the
        predicted label is taken.

        :param model: TextCNN model to evaluate
        :param data_path: path of the test set
        :param batch_size: number of samples per batch
        :param class_num: number of classes (kept for interface compatibility)
        :param save_path: directory holding the checkpoint and the vocabulary
        :param writing_mode: file mode used for the result file
        """
        # Load the vocabulary and the trained weights.
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were written by this project's own training run.
        with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "rb") as f:
            dic = pickle.load(f)
        word2vector = dic['word2vector']
        word2index = dic['word2idx']
        model.load_weights(save_path+"/model.ckpt")

        # Count the samples
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('共有{}条数据'.format(datas_len))

        start_time = time.time()
        batch_num = datas_len // batch_size  # number of full batches
        # Each row holds [predicted, actual], so the width is 2 regardless of class_num
        results = np.zeros(shape=(batch_num * batch_size, 2))

        for i in range(batch_num):
            x_vector, x_index, labels = data2vector.data_processing(
                data_path, datas_len, word2vector, word2index,
                batch_size, i * batch_size)
            x_vector = x_vector[:, :, :, np.newaxis]     # add the channel axis for Conv2D
            scores = model.line1(model.calls(x_vector))  # [batch, class_num]

            predict_value = np.argmax(scores, axis=-1)[:, None]  # predicted label
            actual_value = np.argmax(labels, axis=-1)[:, None]   # actual label
            result = np.concatenate((predict_value, actual_value), axis=1)
            results[i * batch_size: (i + 1) * batch_size, :] = result
            print('\r第 {:3} 批数据,共有{}批数据'.format(i+1, batch_num), end='')

        times = time.time() - start_time
        look_and_save_data(model,results, times,save_path,writing_mode)


    # 打印和保存训练过程或预测结果
    def look_and_save_data(model, result, this_time, save_path,writing_mode,word2vector=None, word2index=None, index2word=None,
                           step=None, loss=None, all_time=None):
        """
        Compute precision, recall, F1 and accuracy, then print and save them.

        In training mode (module-level ``Train`` is True) the training log is
        written and the model weights and vocabulary are saved; otherwise the
        test result file is written.

        :param model: model whose weights are saved in training mode
        :param result: rows of [predicted, actual] labels
        :param this_time: duration of this pass / evaluation
        :param save_path: output directory
        :param writing_mode: file mode for the log / result file
        :param word2vector: token -> vector dict saved with the model
        :param word2index: token -> index dict saved with the model
        :param index2word: index -> token dict saved with the model
        :param step: index of the training pass
        :param loss: mean loss of the pass
        :param all_time: total training time so far
        """
        # Confusion-matrix counts; label 1 is treated as the positive class.
        true_pos = sum(1 for row in result if row.sum() == 2)
        true_neg = sum(1 for row in result if row.sum() == 0)
        false_pos = sum(1 for row in result if (row[0] - row[1]) == 1)
        false_neg = sum(1 for row in result if (row[0] - row[1]) == -1)
        # The small constants keep the ratios defined when a count is zero.
        precision = (true_pos + 0.0001) / (true_pos + false_pos + 0.0001)
        recall = (true_pos + 0.0001) / (true_pos + false_neg + 0.0001)
        f1_score = (2 * precision * recall + 0.00001) / (precision + recall + 0.00001)
        accuracy = (true_pos + true_neg) / len(result)

        os.makedirs(save_path, exist_ok=True)  # make sure the output directory exists

        if Train != True:  # prediction mode: report the metrics and stop
            summary = "P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
                precision, recall, f1_score, accuracy, this_time)
            print(summary)
            with open(save_path+'/test_result.txt', writing_mode, encoding='utf-8') as file:
                file.write(summary)
            return

        # Training mode: print the progress of this pass
        print("\tstep: {:3}  |  mean_loss: {:3f}  |  time: {:3f}m  |  Accuracy: {:3f}  |".format(
            step, loss, this_time / 60, accuracy))
        # Append the metrics of this pass to the training log
        log_line = "step: {:3} | mean_loss: {:3f} | time: {:3f} | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
            step, loss, all_time, precision, recall, f1_score, accuracy)
        with open(save_path+'/train_process.txt', writing_mode, encoding='utf-8') as file:
            file.write(log_line)

        # Save the weights and the vocabulary needed to reload the model
        model.save_weights(save_path+"/model.ckpt")
        os.makedirs(save_path+"/tmp", exist_ok=True)
        vocabulary = {"word2vector": word2vector, "word2idx": word2index, "idx2word": index2word}
        with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "wb") as f:
            pickle.dump(vocabulary, f)


    # Loss function, optimizer and output settings
    cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    optimizer = tf.keras.optimizers.Adam()
    writing_mode = 'w'  # the first write overwrites an existing file
    save_path = './model_data/balanced_CNN_4_128'
    # Build the model
    model = TextCNN(word2vec_size,kernel_num,cnn_layer,learning_rate,class_num,Train)

    if Train == True:  # training mode
        vector_path = 'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector'
        train_path = 'data_set/douban_comment/balanced/balanced_train.txt'
        word2vector, word2index, index2word = data2vector.word2vec_index(vector_path)
        train(model, train_path, batch_size, steps, word2vector, word2index, index2word,
              class_num, cross_entropy, optimizer, save_path, writing_mode)

    else:  # prediction mode
        test_path = 'data_set/douban_comment/balanced/balanced_test.txt'
        test(model, test_path, batch_size, class_num, save_path, writing_mode)

transformer-textCNN.py

import os
import pickle
from all_param import *
import data2vector
import numpy as np
import tensorflow as tf
import time

import block_CNN,block_transformer

class TransformerCNN(tf.keras.Model):
    """Sentiment classifier chaining a transformer block, a TextCNN and a dense layer.

    The transformer comes from ``block_transformer`` and the CNN from
    ``block_CNN``; a final dense layer maps the CNN output to ``class_num``
    scores.  The class also owns its training loop, evaluation loop and
    result logging.
    """

    def __init__(self,max_len, word2vec_size, head_num, class_num, learning_rate, Train,kernel_num,cnn_layer):
        """Create the sub-models, the loss, the optimizer and the output layer.

        :param max_len: forwarded to ``block_transformer.transformer``
        :param word2vec_size: forwarded to both sub-models
        :param head_num: forwarded to ``block_transformer.transformer``
        :param class_num: number of output classes
        :param learning_rate: forwarded to both sub-models
        :param Train: True for training mode, False for prediction mode
        :param kernel_num: forwarded to ``block_CNN.TextCNN``
        :param cnn_layer: forwarded to ``block_CNN.TextCNN``
        """
        super(TransformerCNN, self).__init__()
        self.transformer = block_transformer.transformer(max_len, word2vec_size, head_num, class_num, learning_rate, Train)
        self.CNN = block_CNN.TextCNN(word2vec_size,kernel_num,cnn_layer,learning_rate,class_num,Train)
        self.Train = Train

        # Loss and optimizer.
        self.cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
        # NOTE(review): Adam() is built without arguments, so the Keras default
        # learning rate is used rather than `learning_rate` -- confirm intended.
        self.optimizer = tf.keras.optimizers.Adam()
        # tf_linear / cnn_linear are not called anywhere in this class; they are
        # kept so the set of tracked layers stays the same as before.
        self.tf_linear = tf.keras.layers.Dense(word2vec_size, kernel_initializer='RandomUniform')
        self.cnn_linear = tf.keras.layers.Dense(word2vec_size, kernel_initializer='RandomUniform')
        self.linear = tf.keras.layers.Dense(class_num, kernel_initializer='RandomUniform')  # output layer
        self.writing_mode = 'w'  # the first log write overwrites, later ones append
        self.attention = None

    @staticmethod
    def _count_lines(data_path):
        """Return the number of lines (one sample per line) in ``data_path``."""
        with open(data_path, 'r', encoding='utf-8') as handle:
            return sum(1 for _ in handle)

    def _class_scores(self, x_vector, x_index, layer_num):
        """Run transformer -> CNN -> dense layer and return the un-normalised scores."""
        encoded = self.transformer.calls(x_vector, x_index, layer_num)
        features = self.CNN.calls(encoded[:, :, :, np.newaxis])  # add a trailing axis for the CNN
        return self.linear(features)

    # Training
    def train(self, data_path, batch_size, steps, word2vector, word2index, class_num, save_path, layer_num,
              index2word=None):
        """Train for ``steps`` passes over the data, logging and saving after each pass.

        :param data_path: path of the training set
        :param batch_size: samples per batch
        :param steps: number of passes over the data
        :param word2vector: word -> vector dictionary
        :param word2index: word -> index dictionary
        :param class_num: number of classes (width of the one-hot labels)
        :param save_path: directory for weights, vocabulary pickle and logs
        :param layer_num: forwarded to ``self.transformer.calls``
        :param index2word: index -> word dictionary stored next to the weights;
            rebuilt by inverting ``word2index`` when omitted
        :raises ValueError: if the data set holds fewer than ``batch_size`` samples
        """
        if index2word is None:
            # Previously this name was read from an undeclared module global.
            index2word = {index: word for word, index in word2index.items()}

        datas_len = self._count_lines(data_path)
        print('共有{}条数据'.format(datas_len))

        data_copies = datas_len // batch_size  # number of full batches
        if data_copies == 0:
            raise ValueError('data set has fewer samples than batch_size')

        all_time = 0  # accumulated training time
        for step in range(steps):
            start_time = time.time()

            # Per-pass accumulators.
            all_loss = []
            all_scores = np.zeros(shape=(data_copies * batch_size, class_num),)
            all_labels = np.zeros(shape=(data_copies * batch_size, class_num), dtype=float)

            for i in range(data_copies):
                x_vector, x_index, labels = data2vector.data_processing(
                    data_path, datas_len, word2vector, word2index,
                    batch_size, i * batch_size)
                with tf.GradientTape() as tape:
                    scores = self._class_scores(x_vector, x_index, layer_num)
                    scores = tf.math.softmax(scores, axis=-1)   # class probabilities
                    loss = self.cross_entropy(labels, scores)   # cross-entropy loss
                derivative = tape.gradient(loss, self.trainable_variables)
                self.optimizer.apply_gradients(zip(derivative, self.trainable_variables))

                batch_loss = float(loss)
                rows = slice(i * batch_size, (i + 1) * batch_size)
                all_loss.append(batch_loss)
                all_scores[rows, :] = scores
                all_labels[rows, :] = labels
                print('\r共有{}批数据,第 {:3} 批数据,当前损失: {:4f} '.format(data_copies, i, batch_loss), end='')

            # Log and save after every pass.
            this_time = time.time() - start_time
            all_time += this_time

            predict_value = np.argmax(all_scores, axis=-1)[:, None]  # predicted class id
            actual_value = np.argmax(all_labels, axis=-1)[:, None]   # true class id
            result = np.concatenate((predict_value, actual_value), axis=1)  # rows of [predicted, actual]
            mean_loss = np.array(all_loss).mean()

            self.look_and_save_data(result, this_time, save_path, word2vector=word2vector,
                                    word2index=word2index, index2word=index2word, step=step,
                                    loss=mean_loss, all_time=all_time)
            self.writing_mode = 'a'

    # Evaluation
    def test(self, data_path, batch_size, layer_num, class_num, save_path):
        """Load the saved weights and vocabulary, then evaluate on ``data_path``.

        :param data_path: path of the test set
        :param batch_size: samples per batch
        :param layer_num: forwarded to ``self.transformer.calls``
        :param class_num: number of classes (kept for interface compatibility)
        :param save_path: directory that holds the saved weights and vocabulary
        :raises ValueError: if the data set holds fewer than ``batch_size`` samples
        """
        # NOTE: pickle can execute arbitrary code while loading; only open files
        # written by look_and_save_data.
        with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "rb") as f:
            dic = pickle.load(f)
        word2vector = dic['word2vector']
        word2index = dic['word2idx']
        self.load_weights(save_path+"/model.ckpt")

        datas_len = self._count_lines(data_path)
        print('共有{}条数据'.format(datas_len))

        start_time = time.time()
        batch_num = datas_len // batch_size  # number of full batches
        if batch_num == 0:
            raise ValueError('data set has fewer samples than batch_size')
        # Each row stores [predicted, actual], so the buffer always has two
        # columns regardless of class_num.
        results = np.zeros(shape=(batch_num * batch_size, 2))

        for i in range(batch_num):
            x_vector, x_index, labels = data2vector.data_processing(
                data_path, datas_len, word2vector, word2index,
                batch_size, i * batch_size)
            scores = self._class_scores(x_vector, x_index, layer_num)
            predict_value = np.argmax(scores, axis=-1)[:, None]  # predicted class id
            actual_value = np.argmax(labels, axis=-1)[:, None]   # true class id
            result = np.concatenate((predict_value, actual_value), axis=1)

            results[i * batch_size: (i + 1) * batch_size, :] = result
            print('\r第 {:3} 批数据,共有{}批数据'.format(i + 1, batch_num), end='')

        times = time.time() - start_time
        self.look_and_save_data(results, times,save_path)

    # Print and persist the training progress or the evaluation result
    def look_and_save_data(self, result, this_time, save_path, word2vector=None, word2index=None, index2word=None,
                           step=None, loss=None, all_time=None):
        """Compute P/R/F1/accuracy, print them and append them to a log file.

        In training mode (``self.Train`` is True) the weights and the
        vocabulary dictionaries are saved as well.

        :param result: rows of [predicted, actual] class ids
        :param this_time: duration of this pass in seconds
        :param save_path: output directory (created if missing)
        :param word2vector: word -> vector dictionary (training mode only)
        :param word2index: word -> index dictionary (training mode only)
        :param index2word: index -> word dictionary (training mode only)
        :param step: index of the training pass
        :param loss: mean loss of the pass
        :param all_time: accumulated training time in seconds
        """
        # Confusion-matrix counts; class 1 is treated as the positive class.
        TP = len([i for i in result if i.sum() == 2])
        TN = len([i for i in result if i.sum() == 0])
        FP = len([i for i in result if (i[0] - i[1]) == 1])
        FN = len([i for i in result if (i[0] - i[1]) == -1])
        # The small constants keep the ratios defined when a count is zero.
        P = (TP + 0.0001) / (TP + FP + 0.0001)
        R = (TP + 0.0001) / (TP + FN + 0.0001)
        F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
        Accuracy = (TP + TN) / len(result)
        os.makedirs(save_path, exist_ok=True)

        if self.Train == True:  # training mode
            print("\tstep: {:3}  |  mean_loss: {:3f}  |  time: {:3f}m  |  Accuracy: {:3f}  |".format(
                step, loss, this_time / 60, Accuracy))
            with open(save_path+'/train_process.txt', self.writing_mode, encoding='utf-8') as file:
                file.write(
                    "step: {:3} | mean_loss: {:3f} | time: {:3f} | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
                        step, loss, all_time, P, R, F1, Accuracy))

            # Save the weights and the vocabulary needed to reload the model.
            self.save_weights(save_path+"/model.ckpt")
            os.makedirs(save_path+"/tmp", exist_ok=True)
            with open(save_path+"/tmp/transformer_word2idx_idx2word.pkl", "wb") as f:
                pickle.dump({"word2vector": word2vector, "word2idx": word2index, "idx2word": index2word}, f)

        else:  # prediction mode
            report = "P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f} |\n".format(
                P, R, F1, Accuracy, this_time)
            print(report)
            with open(save_path+'/test_result.txt', self.writing_mode, encoding='utf-8') as file:
                file.write(report)


if __name__ == '__main__':
    # Locations of the output directory, the word vectors and the data sets.
    save_path = './model_data/balanced_RU_1_64_CNN_4_64_label'
    vector_file = 'word2vec/douban_comment/fen_ci128/balanced/balanced_data.vector'
    train_file = 'data_set/douban_comment/balanced/balanced_train.txt'
    test_file = 'data_set/douban_comment/balanced/balanced_test.txt'

    # Build the model from the hyper-parameters in all_param.
    model = TransformerCNN(max_len, word2vec_size, head_num, class_num, learning_rate, Train,kernel_num,cnn_layer)

    if Train == True:
        # Training mode: load the word vectors, then fit the model.
        word2vector, word2index, index2word = data2vector.word2vec_index(vector_file)
        model.train(train_file, batch_size, steps, word2vector, word2index,
                    class_num, save_path, transformer_layer)
    else:
        # Prediction mode: evaluate the saved model on the test set.
        model.test(test_file, batch_size, transformer_layer, class_num, save_path)









二、BERT-textCNN

get_data.py

import numpy as np

# 数据处理和提取
def get_input(path, data_num, data_batch, data_start_site):
    """Read one batch of ``label comment`` records from a text file.

    Each line is expected to look like ``<label> <token> <token> ...`` with
    fields separated by single spaces.  The tokens are joined back into one
    string without separators.

    :param path: path of the data file
    :param data_num: total number of records in the file
    :param data_batch: number of records to read
    :param data_start_site: index of the first record to read; when the batch
        runs past ``data_num`` it wraps around to the start of the file
    :return labels: float array holding 0.001 for label 0 and 1.001 for label 1
        (truncating to an integer gives the class id)
    :return comments: list of comment strings, aligned with ``labels``

    Records whose label is not 0 or 1 (or cannot be parsed as an integer) are
    dropped from BOTH outputs so the two always have the same length.
    """
    with open(path, 'r', encoding='utf-8') as file1:
        data = file1.readlines()

    end_site = data_start_site + data_batch
    if end_site > data_num:
        # Batch crosses the end of the file: take the tail plus the head.
        data = data[data_start_site:] + data[:end_site - data_num]
    else:
        data = data[data_start_site:end_site]

    encoded = {0: 0.001, 1: 1.001}
    labels = []
    comments = []
    for record in data:
        fields = record.replace('\n', '').split(' ')
        try:
            label = int(fields[0])
        except ValueError:
            continue  # blank line or non-numeric label: discard the record
        if label not in encoded:
            continue  # unknown label: discard the record
        labels.append(encoded[label])
        comments.append(''.join(fields[1:]))

    return np.array(labels, dtype=float), comments

if __name__ == '__main__':
    # Manual check: count the records, then print the labels of the first 10.
    train_file = 'data_set/douban_comment/balanced/balanced_train.txt'
    with open(train_file, 'r', encoding='utf-8') as handle:
        data_len = sum(1 for _ in handle)

    labels, comments = get_input(train_file, data_len, 10, 0)
    print(labels)

bert_torch.py

from transformers import BertModel, BertTokenizer
import torch

#print(torch.cuda.is_available())        # 查看GPU是否可用
#print(torch.cuda.device_count())        # 查看GPU数量
#print(torch.cuda.current_device())      # 查看GPU索引号
#print(torch.cuda.get_device_name(0))    # 根据索引号得到GPU名称


class bert(torch.nn.Module):
    """Wrapper around the ``hfl/chinese-bert-wwm`` tokenizer and encoder."""

    def __init__(self, class_num=None):
        """Load the pretrained tokenizer and BERT model.

        :param class_num: accepted so that callers which construct the wrapper
            as ``bert(class_num)`` keep working; it is not used here
        """
        super(bert, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm')  # BERT tokenizer
        self.BERT = BertModel.from_pretrained('hfl/chinese-bert-wwm')  # BERT encoder

    def calls(self, input_list, max_len=None, DEVICE=None):
        """Tokenize a batch of strings and run them through BERT.

        :param input_list: list of input strings
        :param max_len: length every sequence is padded / truncated to;
            defaults to ``all_param.max_len``
        :param DEVICE: device the token tensors are moved to; defaults to the
            device the BERT weights currently live on
        :return outputs: first element of the BERT output (the per-token
            hidden states)
        :return cls: ``outputs[:, 0, :]``, the vector at the first position
        """
        if max_len is None:
            import all_param
            max_len = all_param.max_len
        if DEVICE is None:
            DEVICE = next(self.BERT.parameters()).device

        batch_tokenized = self.tokenizer.batch_encode_plus(input_list, add_special_tokens=True,
                                                           max_length=max_len, padding='max_length',
                                                           truncation=True)

        # The inputs must sit on the same device as the model weights.
        input_ids = torch.tensor(batch_tokenized['input_ids']).to(DEVICE)
        attention_mask = torch.tensor(batch_tokenized['attention_mask']).to(DEVICE)
        hidden_outputs = self.BERT(input_ids, attention_mask=attention_mask)
        outputs = hidden_outputs[0]
        cls = outputs[:, 0, :]
        return outputs, cls


if __name__ == '__main__':
    import get_data
    import numpy as np
    import os
    import time
    from  all_param import *

    def train(BERT, data_path, epoch, batch_size, class_num, optimizer, line, cross_entropy, save_path, writing_mode, Train):
        """Fine-tune BERT plus a linear head, logging and saving after every epoch.

        :param BERT: model exposing ``calls(comments)`` -> (outputs, cls)
        :param data_path: path of the training set
        :param epoch: number of passes over the data
        :param batch_size: samples per batch
        :param class_num: number of classes (kept for interface compatibility)
        :param optimizer: optimizer stepped after every batch
        :param line: linear layer mapping the [CLS] vector to class scores
        :param cross_entropy: loss applied to (scores, integer labels)
        :param save_path: output directory
        :param writing_mode: file mode of the first log write ('w' or 'a')
        :param Train: forwarded to ``look_and_save_data``
        :raises ValueError: if the data set holds fewer than ``batch_size`` samples
        """
        # Count the samples (one per line).
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('一共有{}条数据'.format(datas_len))

        batch_num = datas_len // batch_size  # number of full batches
        if batch_num == 0:
            raise ValueError('data set has fewer samples than batch_size')

        all_time_start = time.time()
        torch.cuda.empty_cache()
        for e in range(epoch):
            this_time_start = time.time()

            all_loss = []
            epoch_scores = []
            epoch_labels = []

            for batch in range(batch_num):
                # Batches are consecutive, non-overlapping slices of the file.
                labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch * batch_size)
                # Labels arrive as 0.001 / 1.001; truncation yields the class id.
                labels = torch.tensor(labels, dtype=torch.float32).long()

                optimizer.zero_grad()              # 1. reset gradients
                _, cls = BERT.calls(comments)      # 2. forward pass
                cls = line(cls)
                loss = cross_entropy(cls, labels)  # 3. loss

                loss.backward()                    # 4. back-propagation
                optimizer.step()                   # 5. parameter update
                print('\r共有{}批数据,第 {:3} 批数据,当前损失: {:4f} '.format(batch_num, batch, loss.item()), end='')

                # Keep detached copies so the autograd graph of every batch is
                # not retained for the whole epoch.
                all_loss.append(loss.item())
                epoch_scores.append(cls.detach().cpu())
                epoch_labels.append(labels)

            # Log and save after every epoch.
            this_time = time.time() - this_time_start  # duration of this epoch
            all_time = time.time() - all_time_start    # total duration so far
            predict_value = np.argmax(torch.cat(epoch_scores, dim=0).numpy(), axis=-1)[:, None]  # predicted class id
            actual_value = torch.cat(epoch_labels, dim=0).numpy()[:, None]                       # true class id
            result = np.concatenate((predict_value, actual_value), axis=1)  # rows of [predicted, actual]
            look_and_save_data(BERT, result, this_time, save_path, writing_mode, Train, step=e,
                                    loss=np.array(all_loss).mean(), all_time=all_time)
            writing_mode = 'a'  # later epochs append to the log

    def test(BERT, data_path, batch_size, class_num, save_path, writing_mode, Train):
        """Load the saved BERT weights and evaluate on ``data_path``.

        :param BERT: model exposing ``calls(comments)`` -> (outputs, cls)
        :param data_path: path of the test set
        :param batch_size: samples per batch
        :param class_num: number of classes (kept for interface compatibility)
        :param save_path: directory holding ``model.ckpt``; results are written here
        :param writing_mode: file mode used for the result file
        :param Train: forwarded to ``look_and_save_data``
        :raises ValueError: if the data set holds fewer than ``batch_size`` samples

        NOTE(review): the linear head is read from the module-level ``line``;
        ``model.ckpt`` is loaded into ``BERT`` only, so the head's weights are
        whatever ``line`` currently holds -- confirm this is acceptable.
        """
        # Count the samples (one per line).
        with open(data_path, 'r', encoding='utf-8') as file1:
            datas_len = len(file1.readlines())
        print('一共有{}条数据'.format(datas_len))

        BERT.load_state_dict(torch.load(save_path+"/model.ckpt"))
        BERT.eval()
        this_time_start = time.time()
        batch_num = datas_len // batch_size  # number of full batches
        if batch_num == 0:
            raise ValueError('data set has fewer samples than batch_size')

        all_scores = []
        all_labels = []
        for batch in range(batch_num):
            # Batches are consecutive, non-overlapping slices of the file.
            labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch * batch_size)
            # Labels arrive as 0.001 / 1.001; truncation yields the class id.
            labels = torch.tensor(labels, dtype=torch.float32).long()

            with torch.no_grad():  # inference only
                _, cls = BERT.calls(comments)
                cls = torch.softmax(line(cls), dim=-1)

            all_scores.append(cls.cpu())
            all_labels.append(labels)
            print('\r共有{}批数据, 第 {:3} 批数据'.format(batch_num, batch+1), end='')

        this_time = time.time() - this_time_start
        predict_value = np.argmax(torch.cat(all_scores, dim=0).numpy(), axis=-1)[:, None]  # predicted class id
        actual_value = torch.cat(all_labels, dim=0).numpy()[:, None]                       # true class id
        result = np.concatenate((predict_value, actual_value), axis=1)  # rows of [predicted, actual]
        look_and_save_data(BERT, result, this_time, save_path, writing_mode, Train)


    # 打印和保存训练过程或预测结果
    def look_and_save_data(model, result, this_time, save_path,writing_mode, Train,
                           step=None, loss=None, all_time=None):
        """Compute P/R/F1/accuracy, print them and write them to a log file.

        :param model: model whose ``state_dict`` is saved in training mode
        :param result: rows of [predicted, actual] class ids
        :param this_time: duration of this run in seconds
        :param save_path: output directory (created if missing)
        :param writing_mode: file mode used for the log file
        :param Train: True writes the training log and saves the model,
            anything else writes the test result
        :param step: index of the training epoch
        :param loss: mean loss of the epoch
        :param all_time: accumulated training time in seconds
        """
        # Tally the confusion-matrix cells in a single pass; class 1 is positive.
        TP = TN = FP = FN = 0
        for row in result:
            row_total = row.sum()
            gap = row[0] - row[1]
            if row_total == 2:
                TP += 1
            if row_total == 0:
                TN += 1
            if gap == 1:
                FP += 1
            if gap == -1:
                FN += 1

        # The small constants keep the ratios defined when a count is zero.
        P = (TP + 0.0001) / (TP + FP + 0.0001)
        R = (TP + 0.0001) / (TP + FN + 0.0001)
        F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
        Accuracy = (TP + TN) / len(result)

        os.makedirs(save_path, exist_ok=True)  # make sure the output directory exists

        if Train == True:
            # Training mode: short console line, full metrics in the log file.
            print("\tstep: {:3}  |  mean_loss: {:3f}  |  time: {:3f}m  |  Accuracy: {:3f}  |".format(
                step, loss, this_time / 60, Accuracy))
            log_line = "step: {:3} | mean_loss: {:3f} | time: {:3f}m | P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} |\n".format(
                step, loss, all_time / 60, P, R, F1, Accuracy)
            with open(save_path+'/train_process.txt', writing_mode, encoding='utf-8') as log_file:
                log_file.write(log_line)

            # Persist the model weights.
            torch.save(model.state_dict(), save_path+"/model.ckpt")
            return

        # Prediction mode: the same line goes to the console and the file.
        report = "P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f}m |\n".format(
            P, R, F1, Accuracy, this_time / 60)
        print(report)
        with open(save_path+'/test_result.txt', writing_mode, encoding='utf-8') as log_file:
            log_file.write(report)

    # Build the encoder, the classification head, the loss and the optimizer.
    bert = bert()
    line = torch.nn.Linear(768, class_num)
    cross_entropy = torch.nn.CrossEntropyLoss()  # cross-entropy over integer class labels
    # The linear head is optimised together with BERT; passing only
    # bert.parameters() would leave `line` at its random initial weights.
    optimizer = torch.optim.Adam(list(bert.parameters()) + list(line.parameters()), lr=learning_rate)
    writing_mode = 'w'  # the first log write overwrites any existing file
    save_path = './model_data/cg'

    if Train == True:  # training mode
        train(bert, 'data_set/douban_comment/balanced/balanced_train.txt', steps, batch_size, class_num,
              optimizer, line, cross_entropy, save_path, writing_mode, Train)

    else:  # evaluation mode
        test(bert, 'data_set/douban_comment/balanced/balanced_test.txt', batch_size, class_num,
             save_path, writing_mode, Train)



bert_textCNN.py

import os
import time
import numpy as np
import get_data
from all_param import *
import bert_torch
import torch
import math
#from transformers import BertTokenizer, BertModel

class TextCNN(torch.nn.Module):
    """TextCNN with three parallel branches (kernel heights 2, 3 and 4).

    Every branch applies a Conv2d over the whole embedding width, ReLU and
    max-pooling, followed by ``cnn_layer - 1`` further Conv1d/ReLU/pool stages
    that are shared by the three branches.  The flattened branch outputs are
    concatenated, passed through dropout and a linear layer.
    """

    def __init__(self,embed_dim,kernel_num,cnn_layer,learning_rate,class_num,DEVICE, seq_len=None):
        """Create the layers.

        :param embed_dim: width of one input row (embedding size)
        :param kernel_num: number of filters of the first convolution
        :param cnn_layer: total number of convolution stages per branch
        :param learning_rate: value used as the dropout probability
        :param class_num: number of output classes
        :param DEVICE: device the convolution layers are moved to
        :param seq_len: number of input rows (sequence length); defaults to the
            module-level ``max_len``
        """
        super(TextCNN, self).__init__()
        if seq_len is None:
            seq_len = max_len
        self.cnn_layer = cnn_layer
        kernel_heights = range(2, 5)

        # ModuleList registers the layers, so their weights show up in
        # parameters(), follow .to() and are seen by the optimizer.
        self.conv = torch.nn.ModuleList(
            [torch.nn.Conv2d(1, kernel_num, (height, embed_dim)) for height in kernel_heights]).to(DEVICE)

        self.relu = torch.nn.ReLU()
        self.max_pool = torch.nn.MaxPool1d(2,ceil_mode=True)  # halves the length, rounding up
        # NOTE(review): the dropout probability is taken from `learning_rate`;
        # confirm that this is intended.
        self.drop = torch.nn.Dropout(learning_rate)

        # Deeper stages: each one doubles the channel count (empty when cnn_layer <= 1).
        self.conv_add = torch.nn.ModuleList(
            [torch.nn.Conv1d(2 ** i * kernel_num, 2 ** (i + 1) * kernel_num, 2)
             for i in range(cnn_layer - 1)]).to(DEVICE)

        # Size of the concatenated, flattened branch outputs.
        channels = 2 ** max(cnn_layer - 1, 0) * kernel_num
        line_dim = sum(channels * self._pooled_length(seq_len, height, cnn_layer)
                       for height in kernel_heights)
        self.line = torch.nn.Linear(line_dim, class_num)

    @staticmethod
    def _pooled_length(seq_len, kernel_height, cnn_layer):
        """Return the sequence length left after all stages of one branch."""
        length = seq_len - kernel_height + 1   # Conv2d without padding
        length = (length + 1) // 2             # pooling, rounded up
        for _ in range(cnn_layer - 1):
            length = length - 1                # Conv1d with kernel size 2
            length = (length + 1) // 2         # pooling, rounded up
        return length

    # One CNN branch
    def conv_and_pool(self,input,conv):
        """Run one branch and return its flattened features.

        :param input: tensor fed to ``conv``
        :param conv: first-stage convolution of the branch
        :return: tensor of shape [batch, features]
        """
        data = conv(input)          # convolution
        data = data.squeeze(3)      # drop the size-1 last axis
        data = self.relu(data)
        data = self.max_pool(data)
        for deeper in self.conv_add:    # deeper stages, if any
            data = self.max_pool(self.relu(deeper(data)))

        return torch.reshape(data,shape=(data.shape[0],-1))   # flatten per sample

    # Combine the three branches
    def calls(self,input):
        """Return the class scores for ``input``.

        :param input: tensor fed to every branch
        :return: tensor of shape [batch, class_num]
        """
        features = [self.conv_and_pool(input, conv) for conv in self.conv]
        merged = torch.cat(features, dim=1)   # concatenate the branch outputs
        merged = self.drop(merged)            # regularisation
        return self.line(merged)              # linear output layer


class mymodel(torch.nn.Module):
    """BERT encoder followed by a TextCNN classifier, with its own train/test loops."""

    def __init__(self, embed_dim, kernel_num, cnn_layer, learning_rate, class_num, Train, DEVICE):
        """Create the sub-models, the loss and the optimizer.

        :param embed_dim: forwarded to ``TextCNN``
        :param kernel_num: forwarded to ``TextCNN``
        :param cnn_layer: forwarded to ``TextCNN``
        :param learning_rate: forwarded to ``TextCNN`` and used as the Adam learning rate
        :param class_num: number of classes
        :param Train: True for training mode, False for prediction mode
        :param DEVICE: forwarded to ``TextCNN``
        """
        super(mymodel, self).__init__()
        self.bert = bert_torch.bert(class_num)
        self.cnn = TextCNN(embed_dim,kernel_num,cnn_layer,learning_rate,class_num,DEVICE)
        self.cross_entropy = torch.nn.CrossEntropyLoss()    # cross-entropy over integer class labels
        self.optimizer = torch.optim.Adam(self.parameters(),lr=learning_rate)

        self.writing_mode = 'w'  # the first log write overwrites, later ones append
        self.Train = Train

    @staticmethod
    def _count_lines(path):
        """Return the number of lines (one sample per line) in ``path``."""
        with open(path, 'r', encoding='utf-8') as handle:
            return sum(1 for _ in handle)

    def Training(self, data_path, verify_path, max_len, DEVICE, epoch, batch_size, class_num, save_path):
        """Train for ``epoch`` passes, saving the model and logging after each pass.

        :param data_path: path of the training set
        :param verify_path: path of the validation set (currently unused)
        :param max_len: forwarded to ``self.bert.calls``
        :param DEVICE: forwarded to ``self.bert.calls``
        :param epoch: number of passes over the data
        :param batch_size: samples per batch
        :param class_num: number of classes (kept for interface compatibility)
        :param save_path: output directory (created if missing)
        :raises ValueError: if the data set holds fewer than ``batch_size`` samples
        """
        self.train()
        datas_len = self._count_lines(data_path)
        print('一共有{}条数据'.format(datas_len))

        batch_num = datas_len // batch_size  # number of full batches
        if batch_num == 0:
            raise ValueError('data set has fewer samples than batch_size')
        os.makedirs(save_path, exist_ok=True)

        all_time_start = time.time()
        for e in range(epoch):
            this_time_start = time.time()

            all_loss = []
            epoch_scores = []
            epoch_labels = []

            for batch in range(batch_num):
                # Batches are consecutive, non-overlapping slices of the file.
                labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch * batch_size)
                # Labels arrive as 0.001 / 1.001; truncation yields the class id.
                long_labels = torch.tensor(labels, dtype=torch.float32).long()

                self.optimizer.zero_grad()  # 1. reset gradients
                outputs, _ = self.bert.calls(comments, max_len, DEVICE)  # 2. forward pass
                outputs = self.cnn.calls(outputs.unsqueeze(1))  # add a channel axis for Conv2d
                outputs = outputs.to('cpu')

                loss = self.cross_entropy(outputs, long_labels)  # 3. loss
                loss.backward()  # 4. back-propagation
                self.optimizer.step()  # 5. parameter update

                # Keep detached copies so the autograd graph of every batch is
                # not retained for the whole epoch.
                all_loss.append(loss.item())
                epoch_scores.append(outputs.detach())
                epoch_labels.append(long_labels)

                print('\r训练进度{:2d}%, 共有{}批数据, 已完成{:2d}%, 当前损失: {:4f}, ACC: {} '.format(
                        int((e) / epoch * 100), batch_num, int((batch + 1) / batch_num * 100),loss.item(), 'None'), end='')

            # Save the whole model and log the metrics after every epoch.
            torch.save(self,save_path + "/model.pth")
            this_time = time.time() - this_time_start  # duration of this epoch
            all_time = time.time() - all_time_start  # total duration so far
            predict_value = np.argmax(torch.cat(epoch_scores, dim=0).numpy(), axis=-1)[:, None]  # predicted class id
            actual_value = torch.cat(epoch_labels, dim=0).numpy()[:, None]  # true class id
            result = np.concatenate((predict_value, actual_value), axis=1)  # rows of [predicted, actual]
            mean_loss = np.array(all_loss).mean()

            self.look_and_save_data(result, this_time, save_path, self.writing_mode, self.Train, step=e,
                                    loss=mean_loss, all_time=all_time)
            self.writing_mode = 'a'  # later epochs append to the log


    def test(self, data_path, batch_size, class_num, save_path, test_data_save=False):
        """Evaluate on ``data_path`` and return the accuracy.

        :param data_path: path of the data set to evaluate
        :param batch_size: samples per batch
        :param class_num: number of classes (kept for interface compatibility)
        :param save_path: directory the result file is written to
        :param test_data_save: when True, print progress and write the metrics
            to ``test_result.txt``
        :return: accuracy over all evaluated samples
        :raises ValueError: if the data set holds fewer than ``batch_size`` samples
        """
        self.eval()
        datas_len = self._count_lines(data_path)
        print('一共有{}条数据'.format(datas_len))

        this_time_start = time.time()
        batch_num = datas_len // batch_size  # number of full batches
        if batch_num == 0:
            raise ValueError('data set has fewer samples than batch_size')
        # Use the device the weights live on instead of a module-level global.
        device = next(self.parameters()).device

        all_scores = []
        all_labels = []
        for batch in range(batch_num):
            # Batches are consecutive, non-overlapping slices of the file.
            labels, comments = get_data.get_input(data_path, datas_len, batch_size, batch * batch_size)
            # Labels arrive as 0.001 / 1.001; truncation yields the class id.
            long_labels = torch.tensor(labels, dtype=torch.float32).long()

            with torch.no_grad():  # inference only
                outputs, _ = self.bert.calls(comments, max_len, device)
                outputs = self.cnn.calls(outputs.unsqueeze(1))

            all_scores.append(outputs.to('cpu'))
            all_labels.append(long_labels)
            if test_data_save != False:
                print('\r共有{}批数据, 测试进度{:2d}% '.format(batch_num, int((batch + 1) / batch_num * 100)), end='')

        this_time = time.time() - this_time_start
        predict_value = np.argmax(torch.cat(all_scores, dim=0).numpy(), axis=-1)[:, None]  # predicted class id
        actual_value = torch.cat(all_labels, dim=0).numpy()[:, None]  # true class id
        result = np.concatenate((predict_value, actual_value), axis=1)  # rows of [predicted, actual]
        # Compute the metrics and, if requested, store them.
        Acc = self.look_and_save_data(result, this_time, save_path, self.writing_mode, test_data_save=test_data_save)

        return Acc


    # Print and persist the training progress or the evaluation result
    def look_and_save_data(self, result, this_time, save_path, writing_mode, Train=False, step=None, loss=None,
                           all_time=None, test_data_save=False):
        """Compute P/R/F1/accuracy, optionally print and store them.

        :param result: rows of [predicted, actual] class ids
        :param this_time: duration of this run in seconds
        :param save_path: directory of the log files
        :param writing_mode: file mode used for the log file
        :param Train: True logs a training step, otherwise a test result
        :param step: index of the training epoch
        :param loss: mean loss of the epoch
        :param all_time: accumulated training time in seconds
        :param test_data_save: in test mode, print and write the metrics only when True
        :return: the accuracy
        """
        # Confusion-matrix counts; class 1 is treated as the positive class.
        TP = len([i for i in result if i.sum() == 2])
        TN = len([i for i in result if i.sum() == 0])
        FP = len([i for i in result if (i[0] - i[1]) == 1])
        FN = len([i for i in result if (i[0] - i[1]) == -1])
        # The small constants keep the ratios defined when a count is zero.
        P = (TP + 0.0001) / (TP + FP + 0.0001)
        R = (TP + 0.0001) / (TP + FN + 0.0001)
        F1 = (2 * P * R + 0.00001) / (P + R + 0.00001)
        Accuracy = (TP + TN) / len(result)

        if Train == True:  # training mode
            print("\tstep: {:3}  |  mean_loss: {:3f}  |  time: {:3f}m  |  train_data_Acc: {:3f}  |".format(
                step, loss, this_time / 60, Accuracy))
            with open(save_path + '/train_process.txt', writing_mode, encoding='utf-8') as file:
                file.write(
                    "step: {:3} | mean_loss: {:3f} | time: {:3f}m | P: {:3f} | R: {:3f} | F1: {:3f} | train_data_Acc: {:3f} |\n".format(
                        step, loss, all_time / 60, P, R, F1, Accuracy))
        elif test_data_save == True:  # prediction mode, results requested
            report = "P: {:3f} | R: {:3f} | F1: {:3f} | Accuracy: {:3f} | time: {:3f}m |\n".format(
                P, R, F1, Accuracy, this_time / 60)
            print(report)
            with open(save_path + '/test_result.txt', writing_mode, encoding='utf-8') as file:
                file.write(report)
        return Accuracy




if __name__ == '__main__':
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('GPU: ', '可用' if str(DEVICE) == "cuda" else "不可用")     # is a GPU available?
    print('torch版本: ', torch.__version__)                    # torch version
    print('GPU数量: ', torch.cuda.device_count())              # number of GPUs
    if torch.cuda.is_available():
        # These queries raise on CPU-only machines, so only run them with CUDA.
        print('GPU索引号: ', torch.cuda.current_device())      # index of the current GPU
        print('GPU名称: ', torch.cuda.get_device_name(0))      # name of GPU 0

    # Output directory and data set locations.
    save_path = 'model_data/balanced_bert_output_CNN_in_50_3_label'
    os.makedirs(save_path, exist_ok=True)
    train_path = 'data_set/douban_comment/balanced/balanced_train.txt'
    test_path = 'data_set/douban_comment/balanced/balanced_test.txt'
    verify_path = 'data_set/douban_comment/balanced/balanced_verify.txt'
    if Train == True:
        model = mymodel(word2vec_size, kernel_num, cnn_layer, learning_rate, class_num, Train, DEVICE).to(DEVICE)
        model.Training(train_path, verify_path, max_len, DEVICE, steps, batch_size, class_num, save_path)

        # Evaluate the freshly trained model once on the test set.
        Train = False
        model.test(test_path, batch_size, class_num, save_path, test_data_save=True)
    else:
        # The whole model object was pickled by Training(); map it onto the
        # current device.  Recent PyTorch versions need weights_only=False to
        # unpickle full objects, older ones reject the keyword.
        try:
            model = torch.load(save_path + "/model.pth", map_location=DEVICE, weights_only=False)
        except TypeError:
            model = torch.load(save_path + "/model.pth", map_location=DEVICE)
        model.test(test_path, batch_size, class_num, save_path, test_data_save=True)


















  • 3
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处理中文情感分类源代码自然语言处
嗨!对于BERT情感分类实战,您可以按照以下步骤进行:

1. 数据准备:
   - 收集和整理情感分类的训练数据集,一般包含文本和对应的情感类别(如正面、负面、中性等)。
   - 将数据集划分为训练集和测试集,确保数据集的均衡性和随机性。
2. BERT模型介绍:
   - BERT(Bidirectional Encoder Representations from Transformers)是一种预训练的自然语言处理模型,通过双向Transformer编码器学习语义表示。
   - 可以选择使用基于TensorFlow或PyTorch实现的BERT模型,或者直接使用已经训练好的BERT模型进行微调。
3. 模型微调:
   - 使用训练集对BERT模型进行微调,即在预训练的BERT模型基础上,通过训练集进行进一步的学习。
   - 微调过程包括输入数据的预处理、构建分类任务的模型结构、定义损失函数和优化算法等。
4. 模型评估:
   - 使用测试集对训练好的BERT模型进行评估,计算分类准确率、精确率、召回率等指标,评估模型在情感分类任务上的性能。
5. 预测与应用:
   - 使用训练好的BERT模型对新的文本进行情感分类预测,得到情感类别的预测结果。
   - 可以将该模型应用于各种情感分析任务,如舆情监测、评论情感分析等。

以上是BERT情感分类实战的一般流程,您可以根据具体需求和数据特点进行相应调整和优化。希望能对您有所帮助!如有更多问题,请随时提问。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值