Xiaohei's TextCNN Code Notes

Core code:
1. Build the model

import time
import os
import pickle as pkl
from tqdm import tqdm
from datetime import timedelta
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.layers import Dense,Embedding,Conv1D,Dropout,Flatten,MaxPooling1D,Input,concatenate
import numpy as np
# Data loading and hyper-parameter definitions
embedding = 'embedding_SougouNews.npz'    # pretrained embedding file
model_name = 'TextCNN'    # model name
MAX_VOCAB_SIZE = 10000    # vocabulary size limit
UNK, PAD = '<UNK>', '<PAD>'    # unknown token, padding token
train_path = './data/train.txt'    # training set
dev_path = './data/dev.txt'    # validation set
test_path = './data/test.txt'    # test set
class_list = [x.strip() for x in open('./data/class.txt').readlines()]    # class names
vocab_path = './data/vocab.pkl'    # vocabulary
save_path = './saved_dict/' + model_name + '.ckpt'    # trained model weights
log_path = './log/' + model_name
embedding_pretrained = tf.convert_to_tensor(
    np.load('./data/' + embedding)['embeddings'].astype('float32')) \
    if embedding != 'random' else None    # pretrained word vectors (None means random initialization)
dropout = 0.5    # dropout rate
num_classes = len(class_list)    # number of classes
n_vocab = 0    # vocabulary size, set at runtime
num_epochs = 20    # number of epochs
batch_size = 128    # mini-batch size
max_len = 32    # every sentence is padded/truncated to this length
learning_rate = 1e-3    # learning rate
embed = embedding_pretrained.shape[1] if embedding_pretrained is not None else 300    # embedding dimension, read from the pretrained vectors
num_filters = 256    # number of convolution filters (channels)
# Build the model
input_shape = (max_len,)    # input shape expected by the Embedding layer
main_input = Input(shape = input_shape, dtype = 'float64')
if embedding_pretrained is not None:    # embedding layer from the pretrained vectors
    embedding = Embedding(input_dim = embedding_pretrained.shape[0],    # vocabulary size
                          output_dim = embedding_pretrained.shape[1],    # word vector dimension
                          input_length = max_len,
                          weights = [embedding_pretrained],    # load pretrained weights
                          trainable = False    # frozen, not trainable
    )
else:
    embedding = Embedding(n_vocab, embed, input_length = max_len)
# Assemble the network: three parallel Conv1D branches with kernel sizes 3, 4, 5
emb = embedding(main_input)
cnn1 = Conv1D(num_filters,3,padding = 'same',strides = 1,activation = 'relu')(emb)
cnn1 = MaxPooling1D(pool_size = 2)(cnn1)
cnn2 = Conv1D(num_filters,4,padding = 'same',strides = 1,activation = 'relu')(emb)
cnn2 = MaxPooling1D(pool_size = 2)(cnn2)
cnn3 = Conv1D(num_filters,5,padding = 'same',strides = 1,activation = 'relu')(emb)
cnn3 = MaxPooling1D(pool_size = 2)(cnn3)
# Concatenate the outputs of the three branches
cnn = concatenate([cnn1,cnn2,cnn3],axis = -1)
flat = Flatten()(cnn)
drop = Dropout(dropout)(flat)
main_output = Dense(num_classes,activation = 'softmax')(drop)
model = tf.keras.Model(inputs = main_input,outputs = main_output)
model.compile(loss = 'categorical_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
model.summary()
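
A quick sanity check on the tensor shapes implied by the settings above: padding='same' with stride 1 keeps the time dimension at max_len, pooling halves it, and the three branches are concatenated along the channel axis. The asserts below are only a sketch (not part of the original script), reusing the variables already defined and assuming TF 2.x symbolic tensors expose .shape:

# Shape sanity check (sketch, not part of the original script)
assert emb.shape[1:] == (max_len, embed)                     # (32, 300) after the Embedding layer
assert cnn1.shape[1:] == (max_len // 2, num_filters)         # (16, 256) after Conv1D + MaxPooling1D
assert cnn.shape[1:] == (max_len // 2, 3 * num_filters)      # (16, 768) after concatenating the 3 branches
assert flat.shape[1:] == (max_len // 2 * 3 * num_filters,)   # (12288,) after Flatten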

Model architecture:
(figure omitted: model architecture diagram / model.summary() output)
2. Prepare the data

# Build the datasets
tokenizer = lambda x:[y for y in x]    # char-level
vocab = pkl.load(open(vocab_path,'rb'))
print(f'Vocab size:{len(vocab)}')
def load_dataset(path):
    contents = []
    with open(path,'r',encoding = 'UTF-8') as f:
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content,label = lin.split('\t')
            words_line = []
            token = tokenizer(content)
            seq_len = len(token)
            for word in token:
                words_line.append(vocab.get(word,vocab.get(UNK)))
            contents.append((words_line,int(label),seq_len))
        # [([14, 125, 55, 45, 35, 307, 4, 81, 161, 941, 258, 494, 2, 175, 48, 145, 97, 17], 3, 18),..
    return contents
start_time = time.time()
train_data =  load_dataset(train_path)
dev_data = load_dataset(dev_path)
test_data = load_dataset(test_path)
end_time = time.time()
time_dif = end_time - start_time
print('time use:',timedelta(seconds = int(round(time_dif))))
def build_net_data(dataset):    # convert to network-ready arrays: pad the id sequences and one-hot the labels
    data = [x[0] for x in dataset]
    data_x = pad_sequences(data,maxlen = max_len)
    label_y = [x[1] for x in dataset]
    label_y = tf.keras.utils.to_categorical(label_y,num_classes = num_classes)
    return data_x,label_y
train_x, train_y = build_net_data(train_data)
dev_x,dev_y = build_net_data(dev_data)
test_x,test_y = build_net_data(test_data)
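
One detail worth keeping in mind: pad_sequences pads and truncates on the left by default, filling with 0, so every row of train_x ends up with exactly max_len ids, while to_categorical turns the integer labels into one-hot rows of length num_classes. A minimal standalone illustration with made-up token ids:

# pad_sequences demo (sketch with hypothetical ids)
demo = pad_sequences([[14, 125, 55]], maxlen = 8)
print(demo)           # [[  0   0   0   0   0  14 125  55]]  -- zero-padded on the left ('pre')
print(train_x.shape)  # (num_samples, max_len)
print(train_y.shape)  # (num_samples, num_classes)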

3. Train the model

# Train the model
model.compile(loss = 'categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
history = model.fit(
    x = train_x,
    y = train_y,
    validation_data = (dev_x,dev_y),
    batch_size = 512,
    epochs = 1
)
# model.save_weights(save_path)
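
The commented-out save_weights line points at save_path, which predict.py later loads with load_weights. A hedged sketch of doing this during training with a standard Keras ModelCheckpoint callback (not part of the original script):

# Sketch: save the best weights to save_path while training
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath = save_path,          # './saved_dict/TextCNN.ckpt'
    save_weights_only = True,      # matches model.load_weights(config.save_path) in predict.py
    save_best_only = True,         # keep only the epoch with the best validation accuracy
    monitor = 'val_accuracy'
)
model.fit(
    x = train_x, y = train_y,
    validation_data = (dev_x, dev_y),
    batch_size = batch_size,
    epochs = num_epochs,
    callbacks = [checkpoint]
)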

4. Model prediction and saving

# Save the model, then reload it and evaluate on the test set
model.save('test_model.h5')
test_model = tf.keras.models.load_model('test_model.h5')
test_model.evaluate(test_x,test_y,verbose=2)
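
Classifying a new raw sentence needs the same preprocessing as training: char-level tokenization, id lookup through vocab, and padding. A minimal sketch (the helper name and the example sentence are made up, not from the original):

# Sketch: classify one raw sentence with the reloaded model
def predict_one(text, model, vocab, class_list, max_len = 32):
    ids = [vocab.get(ch, vocab.get(UNK)) for ch in text]   # char-level ids, same mapping as training
    x = pad_sequences([ids], maxlen = max_len)             # shape (1, max_len)
    probs = model.predict(x)                               # shape (1, num_classes)
    return class_list[int(np.argmax(probs, axis = -1)[0])]

# Hypothetical usage:
# print(predict_one('这是一条体育新闻', test_model, vocab, class_list))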

6. Refactored (modularized) code
(1) models/TextCNN.py

import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Conv1D, Dropout, Flatten, MaxPooling1D, Input,concatenate
import numpy as np

class Config(object):

    """配置参数"""
    def __init__(self, dataset, embedding):
        self.model_name = 'TextCNN'
        self.train_path = dataset + '/data/train.txt'                                # 训练集
        self.dev_path = dataset + '/data/dev.txt'                                    # 验证集
        self.test_path = dataset + '/data/test.txt'                                  # 测试集
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt').readlines()]                                # 类别名单
        self.vocab_path = dataset + '/data/vocab.pkl'                                # 词表
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'        # 模型训练结果
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = tf.convert_to_tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                                       # 预训练词向量

        self.dropout = 0.5                                              # 随机失活
        self.num_classes = len(self.class_list)                         # 类别数
        self.n_vocab = 0                                                # 词表大小,在运行时赋值
        self.num_epochs = 20                                            # epoch数
        self.batch_size = 128                                           # mini-batch大小
        self.max_len = 32                                              # 每句话处理成的长度(短填长切)
        self.learning_rate = 1e-3                                       # 学习率
        self.embed = self.embedding_pretrained.shape[1]\
            if self.embedding_pretrained is not None else 300           # 字向量维度
        self.num_filters = 256                                          # 卷积核数量(channels数)


'''Convolutional Neural Networks for Sentence Classification'''

class CnnModel(tf.keras.Model):
    def __init__(self, config):
        super().__init__()
        self.config = config


    def createModel(self, input):
        main_input = Input(shape=input, dtype='float64')
        if self.config.embedding_pretrained is not None:
            self.embedding = Embedding(input_dim=self.config.embedding_pretrained.shape[0], output_dim=self.config.embedding_pretrained.shape[1],
                                       input_length=self.config.max_len,weights=[self.config.embedding_pretrained],trainable=False)
        else:
            self.embedding = Embedding(self.config.n_vocab, self.config.embed, input_length=self.config.max_len)

        # embedder = Embedding(len(vocab) + 1, 300, input_length=50, trainable=False)
        embed = self.embedding(main_input)
        # three branches with kernel (word window) sizes 3, 4, 5
        cnn1 = Conv1D(self.config.num_filters, 3, padding='same', strides=1, activation='relu')(embed)
        cnn1 = MaxPooling1D(pool_size=2)(cnn1)
        cnn2 = Conv1D(self.config.num_filters, 4, padding='same', strides=1, activation='relu')(embed)
        cnn2 = MaxPooling1D(pool_size=2)(cnn2)
        cnn3 = Conv1D(self.config.num_filters, 5, padding='same', strides=1, activation='relu')(embed)
        cnn3 = MaxPooling1D(pool_size=2)(cnn3)
        # concatenate the output vectors of the three branches
        cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
        flat = Flatten()(cnn)
        drop = Dropout(self.config.dropout)(flat)
        main_output = Dense(self.config.num_classes, activation='softmax')(drop)
        model = tf.keras.Model(inputs=main_input, outputs=main_output)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model
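
A minimal usage sketch for this file (not part of the original; it assumes the file is saved as models/TextCNN.py and that the BruceNews directory layout used by predict.py below exists):

# Usage sketch: build a compiled Keras model from the Config
from models.TextCNN import Config, CnnModel

config = Config('BruceNews', 'embedding_SougouNews.npz')
model = CnnModel(config).createModel(input = (config.max_len,))
model.summary()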

(2) utils.py

import os
import tensorflow as tf
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datetime import timedelta


MAX_VOCAB_SIZE = 10000  # vocabulary size limit
UNK, PAD = '<UNK>', '<PAD>'  # unknown token, padding token


def build_vocab(file_path, tokenizer, max_size, min_freq):
    vocab_dic = {}
    with open(file_path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content = lin.split('\t')[0]
            for word in tokenizer(content):
                vocab_dic[word] = vocab_dic.get(word, 0) + 1
        vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq], key=lambda x: x[1], reverse=True)[:max_size]
        vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
        vocab_dic.update({UNK: len(vocab_dic)})
    return vocab_dic


def build_dataset(config, use_word):
    if use_word:
        tokenizer = lambda x: x.split(' ')  # word-level: tokens are separated by spaces
    else:
        tokenizer = lambda x: [y for y in x]  # char-level
    if os.path.exists(config.vocab_path):
        vocab = pkl.load(open(config.vocab_path, 'rb'))
    else:
        vocab = build_vocab(config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(vocab, open(config.vocab_path, 'wb'))
    print(f"Vocab size: {len(vocab)}")

    def load_dataset(path):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                words_line = []
                token = tokenizer(content)
                seq_len = len(token)
                # word to id
                for word in token:
                    words_line.append(vocab.get(word, vocab.get(UNK)))
                contents.append((words_line, int(label), seq_len))
        return contents  # [(word ids, label, seq_len), ...]
    train = load_dataset(config.train_path)
    dev = load_dataset(config.dev_path)
    test = load_dataset(config.test_path)
    return vocab, train, dev, test

def build_net_data(dataset, config):
    data = [x[0] for x in dataset]
    data_x = pad_sequences(data, maxlen=config.max_len)
    label_y = [x[1] for x in dataset]
    label_y = tf.keras.utils.to_categorical(label_y, num_classes=config.num_classes)
    return data_x, label_y


def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


if __name__ == "__main__":
    '''Extract the pretrained word vectors'''
    # Adjust the directories and file names below as needed.
    train_dir = "./BruceNews/data/train.txt"
    vocab_dir = "./BruceNews/data/vocab.pkl"
    pretrain_dir = "./BruceNews/data/sgns.sogou.char"
    emb_dim = 300
    filename_trimmed_dir = "./BruceNews/data/embedding_SougouNews"
    if os.path.exists(vocab_dir):
        word_to_id = pkl.load(open(vocab_dir, 'rb'))
    else:
        # tokenizer = lambda x: x.split(' ')  # build the vocabulary at word level (words are space-separated in the dataset)
        tokenizer = lambda x: [y for y in x]  # build the vocabulary at character level
        word_to_id = build_vocab(train_dir, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
        pkl.dump(word_to_id, open(vocab_dir, 'wb'))

    embeddings = np.random.rand(len(word_to_id), emb_dim)
    f = open(pretrain_dir, "r", encoding='UTF-8')
    for i, line in enumerate(f.readlines()):
        # if i == 0:  # skip the first line if it is a header
        #     continue
        lin = line.strip().split(" ")
        if lin[0] in word_to_id:
            idx = word_to_id[lin[0]]
            emb = [float(x) for x in lin[1:301]]
            embeddings[idx] = np.asarray(emb, dtype='float32')
    f.close()
    np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)
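
For intuition about build_vocab: it counts token frequencies, keeps tokens with frequency >= min_freq sorted by frequency, truncates to max_size, and appends UNK with the next id (ids start at 0). A small self-contained sketch, assuming build_vocab and MAX_VOCAB_SIZE from this file are in scope and using a made-up two-line sample:

# Sketch: what build_vocab produces for a tiny hypothetical file
import os
import tempfile

sample = '今天天气好\t0\n明天天气差\t1\n'  # two labelled lines, tab-separated
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False, encoding='UTF-8') as tmp:
    tmp.write(sample)

char_tokenizer = lambda x: [y for y in x]
vocab = build_vocab(tmp.name, tokenizer=char_tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
print(vocab)   # e.g. {'天': 0, '气': 1, '今': 2, '好': 3, '明': 4, '差': 5, '<UNK>': 6}
os.remove(tmp.name)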

(3) predict.py

import tensorflow as tf
import time
from tensorflow.keras.models import load_model
from importlib import import_module
from utils import build_dataset, get_time_dif, build_net_data
import argparse


parser = argparse.ArgumentParser(description='Chinese Text Classification')
parser.add_argument('--model',default="TextCNN", type=str, help='choose a model: TextCNN, TextRNN')
parser.add_argument('--embedding', default='pre_trained', type=str, help='random or pre_trained')
parser.add_argument('--word', default=False, action='store_true', help='use word-level tokens instead of char-level')
args = parser.parse_args()


if __name__ == '__main__':
    dataset = 'BruceNews'  # dataset directory

    # Sogou News: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz, random initialization: random
    embedding = 'embedding_SougouNews.npz'
    if args.embedding == 'random':
        embedding = 'random'
    model_name = args.model  # e.g. TextCNN

    x = import_module('models.' + model_name)  # dynamically import the module for the chosen model
    config = x.Config(dataset, embedding)  # initialize the hyper-parameters via that model's Config.__init__
    start_time = time.time()
    print("Loading data...")
    vocab, train_data, dev_data, test_data = build_dataset(config, args.word)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    test_x, test_y = build_net_data(test_data, config)
    # rebuild the model architecture and load the trained weights
    config.n_vocab = len(vocab)
    model = x.CnnModel(config)

    model = model.createModel(input=(config.max_len,))
    model.summary()
    model.load_weights(config.save_path)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Evaluate the model
    score = model.evaluate(test_x, test_y, verbose=2)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
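
Since evaluate only reports the aggregate loss and accuracy, a short sketch (not in the original; it adds a numpy import and reuses the variables above) can print a few individual predictions next to the true labels:

    import numpy as np  # only needed for this sketch

    # Compare the first few predictions with the ground-truth labels
    probs = model.predict(test_x[:5])
    pred_ids = np.argmax(probs, axis=-1)
    true_ids = np.argmax(test_y[:5], axis=-1)
    for p, t in zip(pred_ids, true_ids):
        print('predicted:', config.class_list[p], '| true:', config.class_list[t])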