核心代码:
1.构建模型
import time
import os
import pickle as pkl
from tqdm import tqdm
from datetime import timedelta
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.layers import Dense,Embedding,Conv1D,Dropout,Flatten,MaxPooling1D,Input,concatenate
import numpy as np
# Data locations and hyper-parameters
embedding = 'embedding_SougouNews.npz'  # pretrained embedding file under ./data, or 'random'
model_name = 'TextCNN'  # model name
MAX_VOCAB_SIZE = 10000  # upper bound on the vocabulary size
UNK, PAD = '<UNK>', '<PAD>'  # unknown-token and padding symbols
train_path = './data/train.txt'  # training set
dev_path = './data/dev.txt'  # validation set
test_path = './data/test.txt'  # test set
# Read the class names with an explicit encoding (the other data files are read
# as UTF-8) and close the file as soon as the list is built.
with open('./data/class.txt', encoding='UTF-8') as class_file:
    class_list = [x.strip() for x in class_file]
vocab_path = './data/vocab.pkl'  # pickled vocabulary
save_path = './saved_dict/' + model_name + '.ckpt'  # where trained weights go
log_path = './log/' + model_name
# Only touch the .npz file when a pretrained embedding was requested; with
# embedding == 'random' the value must stay None (converting None to a tensor fails).
if embedding != 'random':
    embedding_pretrained = tf.convert_to_tensor(
        np.load('./data/' + embedding)['embeddings'].astype('float32'))
else:
    embedding_pretrained = None
dropout = 0.5  # dropout rate
num_classes = len(class_list)  # number of classes
n_vocab = 0  # vocabulary size placeholder
num_epochs = 20  # number of epochs
batch_size = 128  # mini-batch size
max_len = 32  # every sentence is brought to this length
learning_rate = 1e-3
# Embedding dimension: taken from the pretrained matrix when there is one, else 300
embed = embedding_pretrained.shape[1] if embedding_pretrained is not None else 300
num_filters = 256  # number of convolution channels
# Build the model
input_shape = (max_len,)  # each sample is a fixed-length sequence of token ids
# Token ids are integers, so the input is declared as int32 rather than float64.
main_input = Input(shape=input_shape, dtype='int32')
if embedding_pretrained is not None:
    # Frozen embedding initialised from the pretrained vectors.
    embedding_layer = Embedding(
        input_dim=embedding_pretrained.shape[0],   # number of tokens
        output_dim=embedding_pretrained.shape[1],  # vector dimension
        input_length=max_len,
        weights=[embedding_pretrained],            # load the pretrained weights
        trainable=False,                           # keep them fixed
    )
else:
    # Randomly initialised, trainable embedding.
    # NOTE(review): n_vocab is initialised to 0 in the settings section and the
    # vocabulary is only loaded further down the script; assign
    # n_vocab = len(vocab) before relying on this branch.
    embedding_layer = Embedding(n_vocab, embed, input_length=max_len)
# Network body: three parallel convolution branches with kernel sizes 3, 4 and 5,
# each followed by max pooling.
emb = embedding_layer(main_input)
branches = []
for kernel_size in (3, 4, 5):
    conv = Conv1D(num_filters, kernel_size, padding='same', strides=1, activation='relu')(emb)
    branches.append(MaxPooling1D(pool_size=2)(conv))
# Merge the outputs of the three branches
cnn = concatenate(branches, axis=-1)
flat = Flatten()(cnn)
drop = Dropout(dropout)(flat)
main_output = Dense(num_classes, activation='softmax')(drop)
model = tf.keras.Model(inputs=main_input, outputs=main_output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
模型架构:
2.制作数据
# Build the data
tokenizer = list  # char-level: a string is split into its individual characters
# NOTE: pickle can execute code while loading; only load a vocab file you created yourself.
# The context manager closes the file handle once the vocabulary is loaded.
with open(vocab_path, 'rb') as vocab_file:
    vocab = pkl.load(vocab_file)
print(f'Vocab size:{len(vocab)}')
def load_dataset(path):
    """Read a tab-separated ``text<TAB>label`` file and convert it to id sequences.

    Blank lines are skipped. Each remaining line is tokenized with the
    module-level ``tokenizer`` and every token is mapped through ``vocab``,
    falling back to the id of ``UNK`` for unknown tokens.

    Returns a list of ``(token_ids, label, original_length)`` tuples, e.g.
    ``[([14, 125, 55, ...], 3, 18), ...]``.
    """
    unk_id = vocab.get(UNK)
    samples = []
    with open(path, 'r', encoding='UTF-8') as handle:
        for raw in tqdm(handle):
            text = raw.strip()
            if text:
                sentence, tag = text.split('\t')
                pieces = tokenizer(sentence)
                ids = [vocab.get(piece, unk_id) for piece in pieces]
                samples.append((ids, int(tag), len(pieces)))
    return samples
# Load the train / dev / test splits (in that order) and report the elapsed time.
start_time = time.time()
train_data, dev_data, test_data = [
    load_dataset(split_path) for split_path in (train_path, dev_path, test_path)
]
end_time = time.time()
time_dif = end_time - start_time
print('time use:', timedelta(seconds=int(round(time_dif))))
def build_net_data(dataset):
    """Turn loaded samples into arrays the network can consume.

    ``dataset`` is a list of tuples whose first element is the token-id list
    and whose second element is the integer label. The id lists are passed to
    ``pad_sequences`` with ``maxlen=max_len`` (default padding/truncation
    settings) and the labels are one-hot encoded over ``num_classes``.

    Returns ``(data_x, label_y)``.
    """
    sequences, labels = [], []
    for sample in dataset:
        sequences.append(sample[0])
        labels.append(sample[1])
    data_x = pad_sequences(sequences, maxlen=max_len)
    label_y = tf.keras.utils.to_categorical(labels, num_classes=num_classes)
    return data_x, label_y
# Convert each split (train, dev, test) into padded inputs and one-hot labels.
(train_x, train_y), (dev_x, dev_y), (test_x, test_y) = [
    build_net_data(split) for split in (train_data, dev_data, test_data)
]
3.训练模型
# Train the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): the batch_size (128) and num_epochs (20) settings defined earlier
# in the script are not used here — confirm that 512 / 1 are intentional.
history = model.fit(
    train_x,
    train_y,
    batch_size=512,
    epochs=1,
    validation_data=(dev_x, dev_y),
)
# Saving only the weights is left disabled:
# model.save_weights(config.save_path)
4.模型预测与保存
# Save the trained model to an HDF5 file, load it back from disk and evaluate
# the reloaded copy on the test split.
model.save(filepath='test_model.h5')
test_model = tf.keras.models.load_model(filepath='test_model.h5')
test_model.evaluate(x=test_x, y=test_y, verbose=2)
6.封装后代码
(1) models/TextCNN.py
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Conv1D, Dropout, Flatten, MaxPooling1D, Input,concatenate
import numpy as np
class Config(object):
    """Paths and hyper-parameters for the TextCNN model.

    Args:
        dataset: root directory of the dataset; the data files are expected
            under ``<dataset>/data``.
        embedding: file name of a pretrained embedding ``.npz`` inside
            ``<dataset>/data`` (array stored under the key ``"embeddings"``),
            or the string ``'random'`` to use no pretrained vectors.
    """

    def __init__(self, dataset, embedding):
        self.model_name = 'TextCNN'
        self.train_path = dataset + '/data/train.txt'  # training set
        self.dev_path = dataset + '/data/dev.txt'  # validation set
        self.test_path = dataset + '/data/test.txt'  # test set
        # Class names, one per line. Read with an explicit encoding (the other
        # data files are read as UTF-8) and close the file right away.
        with open(dataset + '/data/class.txt', encoding='UTF-8') as class_file:
            self.class_list = [x.strip() for x in class_file]
        self.vocab_path = dataset + '/data/vocab.pkl'  # pickled vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # trained weights
        self.log_path = dataset + '/log/' + self.model_name
        # Pretrained embedding matrix as a float32 tensor, or None for 'random'
        if embedding != 'random':
            self.embedding_pretrained = tf.convert_to_tensor(
                np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))
        else:
            self.embedding_pretrained = None
        self.dropout = 0.5  # dropout rate
        self.num_classes = len(self.class_list)  # number of classes
        self.n_vocab = 0  # vocabulary size, assigned at run time
        self.num_epochs = 20  # number of epochs
        self.batch_size = 128  # mini-batch size
        self.max_len = 32  # length every sentence is padded / cut to
        self.learning_rate = 1e-3  # learning rate
        # Character-vector dimension: from the pretrained matrix if present, else 300
        if self.embedding_pretrained is not None:
            self.embed = self.embedding_pretrained.shape[1]
        else:
            self.embed = 300
        self.num_filters = 256  # number of convolution kernels (channels)
class CnnModel(tf.keras.Model):
    """TextCNN ("Convolutional Neural Networks for Sentence Classification").

    The instance only stores the configuration (and, after ``createModel``,
    the embedding layer); the trainable network is the functional Keras model
    returned by ``createModel``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

    def createModel(self, input):
        """Build and compile the TextCNN network.

        Args:
            input: shape tuple of one sample, e.g. ``(config.max_len,)``.
                (The name shadows the builtin but is kept because callers
                pass it by keyword.)

        Returns:
            A compiled ``tf.keras.Model`` mapping token-id sequences to a
            softmax over ``config.num_classes`` classes.
        """
        cfg = self.config
        # Token ids are integers, so the input is declared as int32 rather than float64.
        main_input = Input(shape=input, dtype='int32')
        if cfg.embedding_pretrained is not None:
            # Frozen embedding initialised from the pretrained matrix.
            self.embedding = Embedding(
                input_dim=cfg.embedding_pretrained.shape[0],
                output_dim=cfg.embedding_pretrained.shape[1],
                input_length=cfg.max_len,
                weights=[cfg.embedding_pretrained],
                trainable=False,
            )
        else:
            # Randomly initialised embedding; config.n_vocab must have been
            # set to the vocabulary size by the caller beforehand.
            self.embedding = Embedding(cfg.n_vocab, cfg.embed, input_length=cfg.max_len)
        embed = self.embedding(main_input)
        # One convolution + max-pooling branch per window size (3, 4 and 5).
        branches = []
        for kernel_size in (3, 4, 5):
            conv = Conv1D(cfg.num_filters, kernel_size, padding='same', strides=1,
                          activation='relu')(embed)
            branches.append(MaxPooling1D(pool_size=2)(conv))
        # Merge the output vectors of the three branches
        cnn = concatenate(branches, axis=-1)
        flat = Flatten()(cnn)
        drop = Dropout(cfg.dropout)(flat)
        main_output = Dense(cfg.num_classes, activation='softmax')(drop)
        model = tf.keras.Model(inputs=main_input, outputs=main_output)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model
(2) utils.py
import os
import tensorflow as tf
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datetime import timedelta
# Upper bound on the number of tokens kept in the vocabulary
MAX_VOCAB_SIZE = 10000
# Symbol for tokens that are not in the vocabulary
UNK = '<UNK>'
# Padding symbol
PAD = '<PAD>'
def build_vocab(file_path, tokenizer, max_size, min_freq):
    """Build a token -> id mapping from the text column of a data file.

    Args:
        file_path: UTF-8 text file; the part of each non-blank line before
            the first tab is treated as the text.
        tokenizer: callable turning a text string into a sequence of tokens.
        max_size: maximum number of tokens to keep (most frequent first).
        min_freq: tokens seen fewer times than this are dropped.

    Returns:
        dict mapping each kept token to its rank (0 = most frequent), plus an
        entry for ``UNK`` whose id is the number of kept tokens.
    """
    counts = {}
    with open(file_path, 'r', encoding='UTF-8') as handle:
        for raw in tqdm(handle):
            text = raw.strip()
            if text:
                for token in tokenizer(text.split('\t')[0]):
                    counts[token] = counts.get(token, 0) + 1
    # Filter by frequency, then order by descending count (stable for ties).
    frequent = [pair for pair in counts.items() if pair[1] >= min_freq]
    frequent.sort(key=lambda pair: pair[1], reverse=True)
    vocab_dic = {pair[0]: idx for idx, pair in enumerate(frequent[:max_size])}
    # NOTE(review): only UNK gets an id here; PAD is defined but never added — confirm intended.
    vocab_dic[UNK] = len(vocab_dic)
    return vocab_dic
def build_dataset(config, ues_word):
    """Load (or build) the vocabulary and convert all three splits to id sequences.

    Args:
        config: object providing ``vocab_path``, ``train_path``, ``dev_path``
            and ``test_path``.
        ues_word: truthy for word-level tokens (text split on single spaces),
            falsy for character-level tokens. (Parameter name kept as-is for
            existing callers.)

    Returns:
        ``(vocab, train, dev, test)`` where each split is a list of
        ``(token_ids, label, original_length)`` tuples.
    """
    if ues_word:
        def tokenizer(text):
            return text.split(' ')  # word-level: tokens are separated by spaces
    else:
        tokenizer = list  # char-level: one token per character

    if os.path.exists(config.vocab_path):
        # NOTE: pickle can execute code while loading; only load a vocab file
        # produced by this project.
        with open(config.vocab_path, 'rb') as vocab_file:
            vocab = pkl.load(vocab_file)
    else:
        vocab = build_vocab(config.train_path, tokenizer=tokenizer,
                            max_size=MAX_VOCAB_SIZE, min_freq=1)
        # Close the file explicitly so the pickle is fully flushed to disk.
        with open(config.vocab_path, 'wb') as vocab_file:
            pkl.dump(vocab, vocab_file)
    print(f"Vocab size: {len(vocab)}")

    unk_id = vocab.get(UNK)

    def load_dataset(path):
        """Read one ``text<TAB>label`` file into (ids, label, length) tuples."""
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                token = tokenizer(content)
                # word to id, unknown tokens map to the UNK id
                words_line = [vocab.get(word, unk_id) for word in token]
                contents.append((words_line, int(label), len(token)))
        return contents  # [([...], 0, seq_len), ([...], 1, seq_len), ...]

    train = load_dataset(config.train_path)
    dev = load_dataset(config.dev_path)
    test = load_dataset(config.test_path)
    return vocab, train, dev, test
def build_net_data(dataset, config):
    """Convert loaded samples into network inputs and one-hot labels.

    ``dataset`` is a list of tuples whose first element is the token-id list
    and whose second element is the integer label. Id lists are passed to
    ``pad_sequences`` with ``maxlen=config.max_len`` (default padding and
    truncation settings); labels are one-hot encoded over
    ``config.num_classes``.

    Returns ``(data_x, label_y)``.
    """
    sequences, labels = [], []
    for sample in dataset:
        sequences.append(sample[0])
        labels.append(sample[1])
    data_x = pad_sequences(sequences, maxlen=config.max_len)
    label_y = tf.keras.utils.to_categorical(labels, num_classes=config.num_classes)
    return data_x, label_y
def get_time_dif(start_time):
    """Return the time elapsed since ``start_time`` as a whole-second timedelta.

    ``start_time`` is a timestamp as returned by ``time.time()``.
    """
    elapsed_seconds = time.time() - start_time
    whole_seconds = int(round(elapsed_seconds))
    return timedelta(seconds=whole_seconds)
if __name__ == "__main__":
    # Extract the pretrained vectors for the tokens of our vocabulary and
    # store them as a compressed .npz file.
    # Adjust the directories / file names below as needed.
    train_dir = "./BruceNews/data/train.txt"
    vocab_dir = "./BruceNews/data/vocab.pkl"
    pretrain_dir = "./BruceNews/data/sgns.sogou.char"
    emb_dim = 300
    filename_trimmed_dir = "./BruceNews/data/embedding_SougouNews"
    if os.path.exists(vocab_dir):
        with open(vocab_dir, 'rb') as vocab_file:
            word_to_id = pkl.load(vocab_file)
    else:
        # tokenizer = lambda x: x.split(' ')  # word-level vocabulary (words separated by spaces)
        tokenizer = lambda x: [y for y in x]  # character-level vocabulary
        word_to_id = build_vocab(train_dir, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
        with open(vocab_dir, 'wb') as vocab_file:
            pkl.dump(word_to_id, vocab_file)
    # Tokens without a pretrained vector keep this random initialisation.
    embeddings = np.random.rand(len(word_to_id), emb_dim)
    # Stream the (potentially very large) vector file line by line instead of
    # reading it into memory; the context manager closes it even on errors.
    with open(pretrain_dir, "r", encoding='UTF-8') as f:
        for line in f:
            lin = line.strip().split(" ")
            if lin[0] not in word_to_id:
                continue
            values = lin[1:emb_dim + 1]
            if len(values) != emb_dim:
                # Header or truncated line: not a full vector, leave the row untouched.
                continue
            idx = word_to_id[lin[0]]
            embeddings[idx] = np.asarray([float(x) for x in values], dtype='float32')
    np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)
(3) predict.py
import tensorflow as tf
import time
from tensorflow.keras.models import load_model
from importlib import import_module
from utils import build_dataset, get_time_dif, build_net_data
import argparse
def _str2bool(value):
    """Parse a command-line boolean such as 'True', 'false', '1' or 'no'.

    ``type=bool`` would turn every non-empty string (including 'False') into
    True, so the text is interpreted explicitly instead.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


# Command-line options: model module to load, embedding source, token granularity.
parser = argparse.ArgumentParser(description='Chinese Text Classification')
parser.add_argument('--model', default="TextCNN", type=str, help='choose a model: TextCNN, TextRNN')
parser.add_argument('--embedding', default='pre_trained', type=str, help='random or pre_trained')
parser.add_argument('--word', default=False, type=_str2bool, help='True for word, False for char')
args = parser.parse_args()
if __name__ == '__main__':
    dataset = 'BruceNews'  # dataset root directory
    # Sogou News: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz,
    # random initialisation: random
    embedding = 'random' if args.embedding == 'random' else 'embedding_SougouNews.npz'
    model_name = args.model  # e.g. 'TextCNN'
    # Import the module named after the model so its Config / model classes
    # can be used without hard-coding the import.
    x = import_module('models.' + model_name)
    config = x.Config(dataset, embedding)  # initialise paths and hyper-parameters

    start_time = time.time()
    print("Loading data...")
    vocab, train_data, dev_data, test_data = build_dataset(config, args.word)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    test_x, test_y = build_net_data(test_data, config)

    # Rebuild the network, then restore the trained weights into it.
    config.n_vocab = len(vocab)
    builder = x.CnnModel(config)
    model = builder.createModel(input=(config.max_len,))
    model.summary()
    model.load_weights(config.save_path)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Evaluate the model on the test split
    score = model.evaluate(test_x, test_y, verbose=2)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])