main code
# Import libraries
from collections import OrderedDict
import jieba
import torch
from torch.utils.data import DataLoader
from data_processing import load_data, build_vocab, build_dataset, build_word_embedding
from dataset import MLDataset
from sklearn.model_selection import train_test_split
from importlib import import_module
import pickle
import numpy as np
from utils import train, save_config
import argparse

parser = argparse.ArgumentParser(description='Chinese Text Classification')
# Config loads dataset + embedding as an .npz whenever embedding != 'random'
parser.add_argument('--embedding', default='embedding_SougouNews.npz', type=str,
                    help="'random' or a pretrained-embedding .npz filename, e.g. embedding_SougouNews.npz")
parser.add_argument('--num_epochs', default=1, type=int, help='number of epochs')
parser.add_argument('--word', action='store_true', help='tokenize by word (default: by character)')
args = parser.parse_args()
# Collect training parameters for persistence
def create_d_config(config):
    d_config = OrderedDict()
    d_config['data_path'] = config.data_path
    d_config['vocab_file'] = config.vocab_file
    d_config['model_name'] = config.model_name
    d_config['learning_rate'] = config.learning_rate
    d_config['batch_size'] = config.batch_size
    d_config['embedding_size'] = config.embedding_size
    d_config['num_classes'] = config.num_classes
    d_config['dropout'] = config.dropout
    d_config['num_filters'] = config.num_filters
    d_config['max_vocab_size'] = config.max_vocab_size
    d_config['min_freq'] = config.min_freq
    d_config['log_path'] = config.log_path
    d_config['ckpt_path'] = config.save_path
    d_config['vocab_size'] = config.vocab_size
    d_config['max_seq_len'] = config.max_seq_len
    d_config['class_list'] = config.class_list
    return dict(d_config)
# Project entry point
def main(num_epochs, embedding, model_name, dataset, word):
    # 1. Configuration
    # batch_size = 32
    # num_workers = 2
    # max_size = 10000
    # max_seq_len = 15
    x = import_module("models." + model_name)
    config = x.Config(dataset, embedding)
    # 2. Data preprocessing
    data_path = config.data_path
    if word:
        tokenizer = lambda x: jieba.lcut(x)  # Chinese text is segmented into words
    else:
        tokenizer = lambda x: [y for y in x]  # character-level tokenization
    # Note: config.max_seq_len (21, from data analysis) is expected to match the
    # maximum length observed here.
    data, labels, max_seq_len = load_data(data_path, tokenizer)
    # 3. Build the vocabulary from all data
    vocab_size, word_freqs, dict_word2index, dict_index2word = build_vocab(
        data, config.max_vocab_size, min_freq=config.min_freq)
    config.vocab_size = vocab_size
    ## Save the training-parameter file
    d_config = create_d_config(config)
    save_config(d_config, 'config_file')
    ## Save the vocabulary file
    with open(config.vocab_file, 'wb') as f:
        pickle.dump(dict_word2index, f)
    # 4. Discretize the data with the vocabulary
    datasets, labels = build_dataset(data, labels, dict_word2index, max_seq_len)
    # 5. Feature preparation (train/dev/test)
    # 5.1 Split with sklearn
    X_train, X_val, y_train, y_val = train_test_split(datasets, labels, test_size=0.3, random_state=42)
    print('len(X_train) = ', len(X_train))
    print('len(X_val) = ', len(X_val))
    print('*' * 60)
    print('X_train = ', X_train[:2])
    print('y_train = ', y_train[:2])
    # 5.2 Custom dataset
    train_dataset = MLDataset(X_train, y_train)
    val_dataset = MLDataset(X_val, y_val)
    ## 5.3 DataLoader
    train_loader = DataLoader(dataset=train_dataset, batch_size=config.batch_size,
                              shuffle=True, num_workers=config.num_workers)
    val_loader = DataLoader(dataset=val_dataset, batch_size=config.batch_size,
                            shuffle=False, num_workers=config.num_workers)
    # 6. Model training
    ## 6.0 TODO build word embeddings
    ## Note: Config already loads the .npz at construction time, so the embedding
    ## file must exist before this run (e.g. generated by data_processing's __main__).
    if embedding != 'random':
        build_word_embedding(dataset, dict_word2index)
    ## 6.1 Instantiate the model
    model = x.Model(config)
    model.to(config.device)  # device to run on (cuda/cpu)
    print("models = ", model)
    ## 6.2 Train the model (fix seeds for reproducibility)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True
    train(num_epochs, config, model, train_loader, val_loader)
# Entry point
if __name__ == '__main__':
    embedding = args.embedding  # 'embedding_SougouNews.npz' or 'random'
    model_name = 'TextCNN'
    num_epochs = args.num_epochs
    dataset = 'data/news/'
    main(num_epochs, embedding, model_name, dataset, args.word)
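
A minimal usage sketch, assuming the listing above is saved as main.py and that data/news/data.txt and data/news/class.txt exist (the pretrained case additionally needs the embedding .npz):

# python main.py --embedding random --num_epochs 1             (character-level, random embeddings)
# python main.py --embedding embedding_SougouNews.npz --word   (word-level, pretrained embeddings)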
data_processing code
'''
Data preprocessing utilities
'''
from tqdm import tqdm
import numpy as np
from collections import Counter


def load_data(data_path, tokenizer):
    """
    Load the data.
    :param data_path: path to the data file
    :param tokenizer: tokenizer (character-level or word-level)
    :return:
        (1) input format (text \t label):
            男子因家庭积怨杀死3名亲人重伤1人	5
            男子因家庭积	3
        (2) output format:
            sentences: [[word1, word2, word3, ...], [word11, word12, word13, ...]]
            labels: [5, 3]
    """
    data = []
    labels = []
    total_examples = 0
    max_seq_len = 0
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            splits = line.split('\t')
            if len(splits) == 2:  # text label
                words = tokenizer(splits[0].strip())
                label = int(splits[1].strip())
                data.append(words)
                labels.append(label)
                total_examples = total_examples + 1
                max_seq_len = max(max_seq_len, len(words))
    # Print a few sample records
    print('total_examples = ', total_examples)
    print('label_count = ', len(np.unique(labels)))
    print('sentences example = ', data[:2])
    print('labels example = ', labels[:2])
    print('max_seq_len = ', max_seq_len)
    return data, labels, max_seq_len
def build_vocab(data, max_size, min_freq=3):
    """
    Build the vocabulary from all data.
    :param data:
        sentences: [[word1, word2, word3, ...], [word11, word12, word13, ...]]
    :param max_size: maximum vocabulary size
    :param min_freq: minimum word frequency to keep
    :return:
        word_freqs: word frequencies, e.g. [('<UNK>', -1), ('<PAD>', -1), ('创业', 4), ('风景', 3), ...]
        dict_word2index: index of each word, e.g. {'<UNK>': 0, '<PAD>': 1, '创业': 2, ...}
        dict_index2word: word at each index, e.g. {0: '<UNK>', 1: '<PAD>', 2: '创业', ...}
    """
    word_freqs = [('<UNK>', -1), ('<PAD>', -1)]
    words = []
    for line in data:
        words.extend(line)  # extend vs append: extend adds each word to the list individually
    counter = Counter(words)
    # Sort by frequency, descending
    counter_list = counter.most_common()[:max_size]
    for word, freq in counter_list:
        if freq >= min_freq:
            word_freqs.append((word, freq))
    # Build word2index and index2word
    dict_word2index = dict()
    for word, freq in word_freqs:
        dict_word2index[word] = len(dict_word2index)
    dict_index2word = dict(zip(dict_word2index.values(), dict_word2index.keys()))
    vocab_size = len(word_freqs)
    print('vocab_size = ', vocab_size)
    return vocab_size, word_freqs, dict_word2index, dict_index2word
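

# Sketch: what build_vocab returns on a toy corpus (a hypothetical helper, kept
# in a function so the module stays import-safe; call it manually to try it).
def _demo_build_vocab():
    toy_data = [['体育', '新闻'], ['体育', '财经'], ['体育']]
    vocab_size, word_freqs, w2i, i2w = build_vocab(toy_data, max_size=10, min_freq=1)
    # vocab_size == 5; w2i == {'<UNK>': 0, '<PAD>': 1, '体育': 2, '新闻': 3, '财经': 4}
    print(vocab_size, w2i)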
def build_dataset(data, labels, dict_word2index, max_seq_len):
    """
    Discretize the data with the vocabulary.
    :param data:
        sentences: [[word1, word2, word3, ...], [word11, word12, word13, ...]]
    :param labels: labels, e.g. [5, 3]
    :param dict_word2index: word-to-index mapping
    :param max_seq_len: maximum text length in the data (pads short sequences and
        truncates long ones so all sequences have the same length)
    :return:
        discretized results
        datasets: [[3899, 2, 62, ...], [3, 4, 1, ...]]
        labels: [5, 3]
    """
    dataset = []
    indices = np.arange(len(labels))
    for i in indices:
        # i: index of the i-th example
        new_line = []
        for word in data[i]:  # data[i] is the i-th sentence (a list of words)
            if word in dict_word2index:
                index = dict_word2index.get(word)
            else:
                index = dict_word2index.get('<UNK>')
            new_line.append(index)
        # Short sentences: pad up to the maximum length
        pad_num = max_seq_len - len(new_line)
        while pad_num > 0:
            new_line.append(dict_word2index.get('<PAD>'))
            pad_num -= 1
        # Long sentences: truncate
        dataset.append(new_line[:max_seq_len])
    # Return the final result
    datasets, labels = np.array(dataset, dtype=np.int64), np.array(labels, dtype=np.int64)
    return datasets, labels
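

# Sketch: discretizing a toy corpus with a known vocabulary (hypothetical helper;
# the expected values follow from the padding/truncation logic above).
def _demo_build_dataset():
    toy_data = [['体育', '新闻'], ['财经']]
    w2i = {'<UNK>': 0, '<PAD>': 1, '体育': 2, '新闻': 3, '财经': 4}
    X, y = build_dataset(toy_data, [0, 1], w2i, max_seq_len=4)
    # X -> [[2, 3, 1, 1], [4, 1, 1, 1]] (padded with <PAD> = 1); y -> [0, 1]
    print(X, y)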
def build_word_embedding(root_path, dict_word2index, emb_dim=300):
    # Extract pretrained word vectors
    pretrain_dir = root_path + 'sgns.sogou.char'
    filename_trimmed_dir = root_path + 'embedding_SougouNews'
    vocab_size = len(dict_word2index)
    # Randomly initialize the embedding matrix
    print('*' * 60)
    print("vocab_size = ", vocab_size)
    print('emb_dim = ', emb_dim)
    print('pretrain_dir = ', pretrain_dir)
    print('filename_trimmed_dir = ', filename_trimmed_dir)
    embeddings = np.random.rand(vocab_size, emb_dim)
    print("embeddings shape=", embeddings.shape)
    word_embedding = []
    # Build the in-domain embedding matrix
    with open(pretrain_dir, 'r', encoding='utf-8') as f:
        for i, line in enumerate(tqdm(f)):
            if i == 0:  # skip the header line, if any
                continue
            splits = line.strip().split(" ")  # layout: [word, vector components...]
            word = splits[0]
            if word in dict_word2index:
                idx = dict_word2index[word]  # index of the word
                feat = splits[1:]  # vector components
                emb = [float(x) for x in feat]  # convert to float
                # <index, embedding>
                embeddings[idx] = np.asarray(emb, dtype='float32')  # convert to array
                # <word, embedding> (collected but not persisted here)
                word_embedding.append("{} {}".format(word, " ".join(feat)))
    print('final embeddings = ', embeddings.shape)
    # Save the vector of each vocabulary word <word_idx, vector>
    np.savez_compressed(filename_trimmed_dir, embeddings=embeddings)
    print('*' * 60)
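

# Sketch: the compressed matrix saved above can be reloaded later, as Config in
# TextCNN does (the path is illustrative):
#   emb = np.load('data/news/embedding_SougouNews.npz')['embeddings']
#   emb.shape  # (vocab_size, 300)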
def build_dataset_online(data, dict_word2index, max_seq_len):
    """
    Build a PyTorch dataset online (for inference).
    :param data: text to predict on, as a list of tokens
    :param dict_word2index: vocabulary, dict
    :param max_seq_len: text length
    :return:
        indices of the text, e.g.
        [[220, 22, 233, 32, 55, 96, 700, 16, ..., 1, 1, 1, 1]]
    """
    dataset = []
    new_line = []
    for word in data:
        if word in dict_word2index:
            index = dict_word2index[word]
        else:
            index = dict_word2index.get('<UNK>')  # <UNK>
        new_line.append(index)
    pad_num = max_seq_len - len(new_line)
    while pad_num > 0:
        new_line.append(dict_word2index.get('<PAD>'))  # <PAD>
        pad_num -= 1
    dataset.append(new_line[:max_seq_len])
    datasets = np.array(dataset, dtype=np.int64)
    return datasets
if __name__ == '__main__':
    import jieba
    data_path = 'data/news/data.txt'
    tokenizer = lambda x: jieba.lcut(x)
    data, labels, max_seq_len = load_data(data_path, tokenizer)
    vocab_size, word_freqs, dict_word2index, dict_index2word = build_vocab(data, 100000, min_freq=1)
    build_word_embedding('data/news/', dict_word2index)
TextCNN code
# TextCNN
# 1. embedding layer  2. convolutional layers  3. max-pooling  4. softmax layer
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
class Config(object):
    """
    Configuration parameters
    """
    def __init__(self, dataset, embedding='random'):
        # Data path
        self.data_path = dataset + 'data.txt'
        # Vocabulary file
        self.vocab_file = dataset + 'word2index.pkl'
        # Model name
        self.model_name = 'TextCNN'
        # Learning rate
        self.learning_rate = 0.001
        # Mini-batch size
        self.batch_size = 256
        # Class names
        with open(dataset + 'class.txt', 'r', encoding='utf-8') as f:
            self.class_list = [x.strip() for x in f.readlines()]
        # Number of classes
        self.num_classes = len(self.class_list)
        # Dropout rate
        self.dropout = 0.5
        # Convolution kernel sizes
        self.filter_size = [2, 3, 4]
        # Number of kernels per size
        self.num_filters = 256
        # DataLoader workers
        self.num_workers = 3
        # Maximum vocabulary size
        self.max_vocab_size = 100000
        # Minimum word frequency
        self.min_freq = 1
        # Device type
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Initial vocabulary size
        self.vocab_size = self.max_vocab_size + 2  # +2 for <UNK> and <PAD>
        # Maximum sentence length (from exploratory data analysis)
        self.max_seq_len = 21
        # Embedding parameters; shape: <vocab_size, embedding_size>
        self.embedding_pretrained = torch.tensor(np.load(dataset + embedding)['embeddings'].astype('float32')) \
            if embedding != 'random' else None
        self.embedding_size = self.embedding_pretrained.size(1) if self.embedding_pretrained is not None else 300
        # Log directory
        self.log_path = dataset + 'ckpts/' + self.model_name
        # Checkpoint of the trained model
        self.save_path = dataset + 'ckpts/' + self.model_name + '.ckpt'
        # Stop training if val loss has not improved for this many batches
        self.require_improvement = 1000
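

# Sketch: with embedding='random' no .npz is loaded, so Config can be built
# before any embedding file exists (class.txt must still be present):
#   config = Config('data/news/', embedding='random')  # embedding_size falls back to 300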
class Model(nn.Module):
    """
    A model defined with PyTorch
    """
    def __init__(self, config):
        super(Model, self).__init__()
        # 1. embedding layer
        if config.embedding_pretrained is not None:
            # Initialize the embedding from external word vectors
            print('pre_trained init embedding')
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            # Random initialization
            print('random init embedding')
            self.embedding = nn.Embedding(num_embeddings=config.vocab_size, embedding_dim=config.embedding_size)
        # 2. convolutional layers -> relu -> max-pooling
        self.convs = nn.ModuleList(
            nn.Sequential(
                # convolutional layer
                nn.Conv1d(in_channels=config.embedding_size, out_channels=config.num_filters, kernel_size=h),
                # ReLU activation
                nn.ReLU(),
                # pooling maps variable-length sentences to a fixed-size representation
                nn.MaxPool1d(kernel_size=config.max_seq_len - h + 1)
            ) for h in config.filter_size
        )
        # 3. fully connected layer
        ## outputs a score (logit) per class
        self.fc = nn.Linear(in_features=config.num_filters * len(config.filter_size),
                            out_features=config.num_classes)
        # dropout rate
        self.dropout = config.dropout
    def forward(self, x):
        """
        Predict on x.
        :param x:
            input format:
            tensor([
                [2, 5, 7, ......., 1],
                [70, 0, 2, ......., 1],
            ])
        :return:
        """
        # 1. x -> embedding
        # <batch_size, max_seq_len, embedding_dim>
        embed_x = self.embedding(x)
        # reshape to <batch_size, embedding_dim, max_seq_len>
        embed_x = embed_x.permute(0, 2, 1)
        # 2. convolutional layers
        out = [conv(embed_x) for conv in self.convs]
        out = torch.cat(out, dim=1)
        out = out.view(-1, out.size(1))
        # pass training=self.training so dropout is disabled in eval mode
        out = F.dropout(input=out, p=self.dropout, training=self.training)
        # 3. fc
        out = self.fc(out)
        return out
    def predict(self, x):
        """
        :param x: a line of text, discretized into word indices
            format:
            tensor([[2, 5, 8, 157, 1, ............., 1]])
        :return:
        """
        embed_x = self.embedding(x)
        embed_x = embed_x.permute(0, 2, 1)
        out = [conv(embed_x) for conv in self.convs]
        out = torch.cat(out, dim=1)
        out = out.view(-1, out.size(1))
        out = self.fc(out)
        return out
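

# Sketch: a quick shape check with random embeddings (assumes data/news/class.txt
# exists to define num_classes; hypothetical helper, call it manually to try it).
def _demo_shapes():
    config = Config('data/news/', embedding='random')
    model = Model(config)
    x = torch.randint(0, config.vocab_size, (2, config.max_seq_len))  # batch of 2
    out = model(x)
    print(out.shape)  # torch.Size([2, num_classes])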
utils code
"""
模型训练相关方法工具类
"""
import time
from datetime import timedelta
import torch
import json
import torch.nn.functional as F
from tensorboardX import SummaryWriter
from sklearn import metrics
import numpy as np
def get_time_dif(start_time):
"""
获取已经使用的时间
:param start_time:
:return:
"""
end_time = time.time()
time_df = end_time - start_time
return timedelta(seconds= int(round(time_df)) )
def save_config(config, config_file):
"""
保持模型配置文件
参数以json format 存储
"""
with open(config_file, "w", encoding="utf8") as f:
json.dump(config, f, ensure_ascii=False, indent=4)
def load_config(config_file):
"""
加载模型配置文件
参数以json format存储
"""
with open(config_file, encoding="utf8") as f:
return json.load(f)
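

# Sketch: save_config/load_config round trip (the file name 'config_file' is
# illustrative; hypothetical helper, call it manually to try it).
def _demo_config_roundtrip():
    save_config({'model_name': 'TextCNN', 'batch_size': 256}, 'config_file')
    cfg = load_config('config_file')
    assert cfg['batch_size'] == 256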
def train(num_epochs, config, model, train_loader, val_loader):
    """
    Train the model.
    :param num_epochs: number of epochs
    :param config: configuration parameters
    :param model: model
    :param train_loader: training set
    :param val_loader: validation set
    :return:
    """
    last_best_batch = 0  # batch at which val loss last improved
    total_batch = 0
    val_best_loss = float('inf')
    flag = False  # whether there has been no improvement for a long time
    start_time = time.time()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    writer = SummaryWriter(log_dir=config.log_path + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))
    for epoch in range(num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, num_epochs))
        # Iterate over train_loader: predict with the model -> loss -> update weights -> ...
        # <outputs, labels> -> CrossEntropy(outputs, labels) -> loss
        for i, (feats, labels) in enumerate(train_loader):
            feats = feats.to(config.device)
            labels = labels.to(config.device)
            outputs = model(feats)
            optimizer.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()  # backpropagate to compute gradients
            optimizer.step()
            total_batch += 1
            # Periodically evaluate on val_loader -> save the best model
            if total_batch % 10 == 0:
                y_true = labels.data.cpu()
                y_pred = torch.max(outputs.data, 1)[1].cpu()  # predictions
                train_acc = metrics.accuracy_score(y_true, y_pred)
                valid_acc, val_loss = val(config, model, val_loader)
                # Save the best model
                if val_loss < val_best_loss:
                    val_best_loss = val_loss
                    torch.save(model.state_dict(), config.save_path)
                    is_best = '*'
                    last_best_batch = total_batch
                else:
                    is_best = ''
                # Log train acc/loss and val acc/loss; visualize with tensorboardX
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, ' \
                      ' Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, val_loss, valid_acc, time_dif, is_best))
                writer.add_scalar("loss/train", loss.item(), total_batch)
                writer.add_scalar("loss/val", val_loss, total_batch)
                writer.add_scalar("acc/train", train_acc, total_batch)
                writer.add_scalar("acc/val", valid_acc, total_batch)
                model.train()
            # end batch
            if total_batch - last_best_batch > config.require_improvement:
                # Stop training if val loss has not improved for require_improvement batches
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        # end for
        if flag:
            break
    # end epoch
    writer.close()
    test(config, model, val_loader)
def test(config, model, test_iter):
    """
    Evaluate on the test data.
    :param config:
    :param model:
    :param test_iter:
    :return:
    """
    # test
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = val(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
def val(config, model, val_loader, test=False):
    """
    Evaluation function.
    :param config:
    :param model:
    :param val_loader:
    :param test:
    :return:
    """
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    # Run the whole evaluation without building the autograd graph
    with torch.no_grad():
        for i, (feats, labels) in enumerate(val_loader):
            feats = feats.to(config.device)
            labels = labels.to(config.device)
            outputs = model(feats)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss.item()
            labels = labels.data.cpu().numpy()
            preds = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, preds)
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all,
                                               target_names=config.class_list,
                                               digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(val_loader), report, confusion
    return acc, loss_total / len(val_loader)