Text classification shows up in many areas of NLP, such as sentiment analysis, intent recognition, and domain classification. Let's first walk through the overall workflow of text classification with paddle.
The overall workflow of an NLP task generally looks like this:
1. Data processing (turning raw data into a format the model can consume)
2. Model construction (building the model you want to use)
3. Training and evaluating the model
This post uses the paddle framework to implement text classification with an ordinary model; a follow-up will use a pretrained model such as ernie for text classification.
1. Data Processing
This part covers reading the data (training set) from files and then processing it, which usually means padding (padding short sequences with 0 and truncating overly long ones) and mapping (mapping tokens and labels to indices in the vocabulary dictionary and label dictionary). This part has little to do with the framework itself, but following the framework's conventions makes the flow much clearer.
In paddle, the reader is what generates data in the format the model expects. This post uses sentiment analysis as the example.
The data format is as follows:
As you can see above, the text column is Chinese, so we need to convert it into indices (numbers), while the label column is already in 0/1 form and needs no conversion. If the labels were strings like "positive"/"negative", they would first have to be mapped to 1/0.
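A hypothetical TSV sample in the layout the reader code below expects (a header line, then one example per line: numeric label, a tab, then space-separated tokens):

label	text_a
1	这 家 酒店 位置 不错 , 服务 也 很 好
0	房间 太 小 , 性价比 不 高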
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import io
import os
import sys
import random

import numpy as np
from paddle import fluid

# Read the vocabulary file and build the token-to-id dict
def load_vocab(file_path):
    """
    Load the given vocabulary
    """
    vocab = {}
    with io.open(file_path, 'r', encoding='utf8') as f:
        wid = 0
        for line in f:
            if line.strip() not in vocab:
                vocab[line.strip()] = wid
                wid += 1
    vocab["<unk>"] = len(vocab)
    return vocab
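load_vocab assumes the vocabulary file holds one token per line, so each token's id is simply its line number; the <unk> entry for out-of-vocabulary tokens is appended automatically with the last id. A hypothetical first few lines of data/vocab.txt:

的
,
了
酒店
服务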
# Define the reader generator: converts each line into (word ids, label, seq_len)
def data_reader(file_path, word_dict, num_examples, phrase, epoch, max_seq_len):
    """
    Convert word sequence into slot
    """
    unk_id = word_dict.get('<unk>')
    pad_id = 0
    all_data = []
    with io.open(file_path, "r", encoding='utf8') as fin:
        for line in fin:
            # skip the header line
            if line.startswith('label'):
                continue
            cols = line.strip().split("\t")
            if len(cols) != 2:
                sys.stderr.write("[NOTICE] Error Format Line!\n")
                continue
            label = int(cols[0])
            # map tokens to ids, falling back to <unk> for OOV tokens
            wids = [word_dict[x] if x in word_dict else unk_id
                    for x in cols[1].split(" ")]
            seq_len = len(wids)
            # pad short sequences with pad_id, truncate long ones
            if seq_len < max_seq_len:
                for i in range(max_seq_len - seq_len):
                    wids.append(pad_id)
            else:
                wids = wids[:max_seq_len]
                seq_len = max_seq_len
            all_data.append((wids, label, seq_len))
    if phrase == "train":
        random.shuffle(all_data)
    num_examples[phrase] = len(all_data)

    def reader():
        """
        Reader Function
        """
        for epoch_index in range(epoch):
            for doc, label, seq_len in all_data:
                yield doc, label, seq_len
    return reader
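Each call of the returned reader yields one example as a (word_ids, label, seq_len) tuple. For instance, with max_seq_len=8, a four-token positive example would come out like this (the ids here are illustrative):

([23, 7, 102, 5, 0, 0, 0, 0], 1, 4)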
# Define our own data processor
class SentaProcessor(object):
    """
    Processor class for data converters for senta
    """
    def __init__(self,
                 data_dir,
                 vocab_path,
                 random_seed,
                 max_seq_len):
        self.data_dir = data_dir
        self.vocab = load_vocab(vocab_path)
        self.num_examples = {"train": -1, "dev": -1, "infer": -1}
        self.current_train_example = 0
        self.current_train_epoch = 0
        np.random.seed(random_seed)
        self.max_seq_len = max_seq_len

    def get_train_examples(self, data_dir, epoch, max_seq_len):
        """
        Load training examples
        """
        return data_reader((self.data_dir + "/train.tsv"), self.vocab,
                           self.num_examples, "train", epoch, max_seq_len)

    def get_dev_examples(self, data_dir, epoch, max_seq_len):
        """
        Load dev examples
        """
        return data_reader((self.data_dir + "/dev.tsv"), self.vocab,
                           self.num_examples, "dev", epoch, max_seq_len)

    def get_test_examples(self, data_dir, epoch, max_seq_len):
        """
        Load test examples
        """
        return data_reader((self.data_dir + "/test.tsv"), self.vocab,
                           self.num_examples, "infer", epoch, max_seq_len)
    def get_labels(self):
        """
        Return labels (binary sentiment: 0 = negative, 1 = positive)
        """
        return ["0", "1"]
    def get_num_examples(self, phase):
        """
        Return num of examples in train, dev, infer set
        """
        if phase not in ['train', 'dev', 'infer']:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'infer'].")
        return self.num_examples[phase]

    def get_train_progress(self):
        """
        Get train progress
        """
        return self.current_train_example, self.current_train_epoch

    # Batch the examples; shuffling is already done inside data_reader for 'train'
    def data_generator(self, batch_size, phase='train', epoch=1, shuffle=True):
        """
        Generate batches for train, dev or infer
        """
        if phase == "train":
            return fluid.io.batch(
                self.get_train_examples(self.data_dir, epoch, self.max_seq_len),
                batch_size)
        elif phase == "dev":
            return fluid.io.batch(
                self.get_dev_examples(self.data_dir, epoch, self.max_seq_len),
                batch_size)
        elif phase == "infer":
            return fluid.io.batch(
                self.get_test_examples(self.data_dir, epoch, self.max_seq_len),
                batch_size)
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'infer'].")
if __name__ == '__main__':
    dev_path = 'data/dev.tsv'
    word_dict = load_vocab('data/vocab.txt')
    num_examples = {'train': -1, 'dev': -1, 'infer': -1}
    # data_reader returns a reader function; call it to get a generator
    reader = data_reader(dev_path, word_dict, num_examples, 'dev', 1, 24)
    print(next(reader()))
2. Model Construction
Define the model structure, using the simplest bow_net as the example:
import paddle.fluid as fluid

def bow_net(data,
            seq_len,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            is_prediction=False):
    """
    Bow net
    """
    # embedding layer
    emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
    # drop the padding positions so they don't contribute to the sum
    emb = fluid.layers.sequence_unpad(emb, length=seq_len)
    # bow layer: sum the word embeddings of each sentence
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    # fully connected layers
    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
    # softmax output layer
    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
    if is_prediction:
        return prediction
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    return avg_cost, prediction
This model simply sums the word embeddings of a sentence, then passes the result through fully connected layers and a softmax for classification.
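For reference, a shape trace through the forward pass, with batch size B, max_seq_len T, and the default dimensions above:

# data:        [B, T]       padded int64 word ids
# emb:         [B, T, 128]  after fluid.embedding
# after sequence_unpad: LoDTensor [sum(seq_len), 128], padding removed
# bow:         [B, 128]     one summed vector per sentence (sequence_pool 'sum')
# fc_1: [B, 128]; fc_2: [B, 96]; prediction: [B, class_dim] softmax probabilities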
3. Training and Evaluating the Model
First, set up the model's inputs and outputs and pick which network to build (it's all boilerplate; here there is only one network, bow_net):
from bow_net_blog import bow_net
import reader_blog as reader
from paddle import fluid
import argparse
import os
import time
import numpy as np

def create_model(args, pyreader_name, num_labels, is_prediction=False):
    data = fluid.data(
        name="src_ids", shape=[None, args.max_seq_len], dtype='int64')
    label = fluid.data(name="label", shape=[None, 1], dtype="int64")
    seq_len = fluid.data(name="seq_len", shape=[None], dtype="int64")
    data_reader = fluid.io.DataLoader.from_generator(
        feed_list=[data, label, seq_len], capacity=4, iterable=False)
    network = bow_net
    if is_prediction:
        probs = network(
            data, seq_len, None, args.vocab_size,
            class_dim=num_labels, is_prediction=is_prediction)
        print("create inference model...")
        return data_reader, probs, [data.name, seq_len.name]
    ce_loss, probs = network(
        data, seq_len, label, args.vocab_size,
        class_dim=num_labels, is_prediction=is_prediction)
    num_seqs = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(input=probs, label=label, total=num_seqs)
    return data_reader, ce_loss, accuracy, num_seqs
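Note that because the DataLoader is created with iterable=False, the training loop below must bind a sample-list generator to it with set_sample_list_generator, call start() before running the program, and call reset() when fluid.core.EOFException signals the end of the data stream.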
Then read the data and train:
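main() below calls get_args(), which is not shown in the original code. A minimal argparse sketch covering every flag main() uses could look like this (all names match main()'s usage, but every default here is an assumption):

def get_args():
    """Parse command-line arguments; all defaults are assumptions."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_cuda", action='store_true')
    parser.add_argument("--data_dir", type=str, default="data")
    parser.add_argument("--vocab_path", type=str, default="data/vocab.txt")
    # must match len(load_vocab(vocab_path))
    parser.add_argument("--vocab_size", type=int, default=33256)
    parser.add_argument("--max_seq_len", type=int, default=256)
    parser.add_argument("--random_seed", type=int, default=None)
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--epoch", type=int, default=10)
    parser.add_argument("--lr", type=float, default=0.002)
    parser.add_argument("--do_train", action='store_true')
    parser.add_argument("--do_val", action='store_true')
    parser.add_argument("--do_infer", action='store_true')
    parser.add_argument("--skip_steps", type=int, default=10)
    parser.add_argument("--save_steps", type=int, default=1000)
    parser.add_argument("--validation_steps", type=int, default=1000)
    parser.add_argument("--checkpoints", type=str, default="checkpoints")
    parser.add_argument("--init_checkpoint", type=str, default=None)
    parser.add_argument("--verbose", action='store_true')
    return parser.parse_args()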
def main():
    """
    Main Function
    """
    args = get_args()
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = 1
    exe = fluid.Executor(place)

    processor = reader.SentaProcessor(
        data_dir=args.data_dir,
        vocab_path=args.vocab_path,
        random_seed=args.random_seed,
        max_seq_len=args.max_seq_len)
    num_labels = len(processor.get_labels())
    if not (args.do_train or args.do_val or args.do_infer):
        raise ValueError("For args `do_train`, `do_val` and `do_infer`, at "
                         "least one of them must be True.")
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size // dev_count,
            phase='train',
            epoch=args.epoch,
            shuffle=True)
        num_train_examples = processor.get_num_examples(phase="train")
        max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_reader, loss, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='train_reader',
                    num_labels=num_labels,
                    is_prediction=False)
                sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.lr)
                sgd_optimizer.minimize(loss)
        if args.verbose:
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val:
        test_data_generator = processor.data_generator(
            batch_size=args.batch_size // dev_count,
            phase='dev',
            epoch=1,
            shuffle=False)
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_reader, loss, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='test_reader',
                    num_labels=num_labels,
                    is_prediction=False)
        test_prog = test_prog.clone(for_test=True)

    if args.do_infer:
        infer_data_generator = processor.data_generator(
            batch_size=args.batch_size // dev_count,
            phase='infer',
            epoch=1,
            shuffle=False)
        infer_prog = fluid.Program()
        with fluid.program_guard(infer_prog, startup_prog):
            with fluid.unique_name.guard():
                infer_reader, prop, _ = create_model(
                    args,
                    pyreader_name='infer_reader',
                    num_labels=num_labels,
                    is_prediction=True)
        infer_prog = infer_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint:
            init_checkpoint(
                exe, args.init_checkpoint, main_program=startup_prog)
    elif args.do_val or args.do_infer:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog)

    if args.do_train:
        train_exe = exe
        train_reader.set_sample_list_generator(train_data_generator)
    else:
        train_exe = None
    if args.do_val:
        test_exe = exe
        test_reader.set_sample_list_generator(test_data_generator)
    if args.do_infer:
        test_exe = exe
        infer_reader.set_sample_list_generator(infer_data_generator)

    if args.do_train:
        train_reader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                # only fetch metrics every skip_steps steps to save time
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, accuracy.name, num_seqs.name]
                else:
                    fetch_list = []
                outputs = train_exe.run(program=train_program,
                                        fetch_list=fetch_list,
                                        return_numpy=False)
                if steps % args.skip_steps == 0:
                    np_loss, np_acc, np_num_seqs = outputs
                    np_loss = np.array(np_loss)
                    np_acc = np.array(np_acc)
                    np_num_seqs = np.array(np_num_seqs)
                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_reader.queue.size()
                        print(verbose)
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("step: %d, ave loss: %f, "
                          "ave acc: %f, speed: %f steps/s" %
                          (steps, np.sum(total_cost) / np.sum(total_num_seqs),
                           np.sum(total_acc) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()
                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps), "checkpoint")
                    fluid.save(train_program, save_path)
                if steps % args.validation_steps == 0:
                    # evaluate on the dev set
                    if args.do_val:
                        print("do evaluation")
                        evaluate(exe, test_prog, test_reader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps),
                                         "checkpoint")
                fluid.save(train_program, save_path)
                train_reader.reset()
                break

    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        evaluate(exe, test_prog, test_reader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")
    # final inference on test set
    if args.do_infer:
        print("Final test result:")
        inference(exe, infer_prog, infer_reader, [prop.name], "infer")
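The init_checkpoint, evaluate, and inference helpers called above are not shown in the original post. Minimal sketches consistent with how main() uses them (the exact bodies are assumptions) could look like:

def init_checkpoint(exe, init_checkpoint_path, main_program):
    """Load parameters saved with fluid.save; a minimal assumed implementation."""
    assert os.path.exists(init_checkpoint_path + ".pdparams"), \
        "[%s.pdparams] not found" % init_checkpoint_path
    fluid.load(main_program, init_checkpoint_path, exe)

def evaluate(exe, test_prog, test_reader, fetch_list, eval_phase):
    """Run one pass over the dev set and print averaged loss/acc."""
    test_reader.start()
    total_cost, total_acc, total_num = [], [], []
    while True:
        try:
            np_loss, np_acc, np_num = exe.run(program=test_prog,
                                              fetch_list=fetch_list,
                                              return_numpy=False)
            np_loss, np_acc, np_num = (np.array(np_loss), np.array(np_acc),
                                       np.array(np_num))
            total_cost.extend(np_loss * np_num)
            total_acc.extend(np_acc * np_num)
            total_num.extend(np_num)
        except fluid.core.EOFException:
            test_reader.reset()
            break
    print("[%s evaluation] ave loss: %f, ave acc: %f" %
          (eval_phase, np.sum(total_cost) / np.sum(total_num),
           np.sum(total_acc) / np.sum(total_num)))

def inference(exe, infer_prog, infer_reader, fetch_list, infer_phase):
    """Run one pass over the test set and print the predicted class per example."""
    infer_reader.start()
    while True:
        try:
            batch_probs = exe.run(program=infer_prog,
                                  fetch_list=fetch_list,
                                  return_numpy=True)
            for probs in batch_probs[0]:
                print("%d\t%s" % (np.argmax(probs), probs))
        except fluid.core.EOFException:
            infer_reader.reset()
            break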
That's the complete workflow for training your own model in paddle.
The code and dataset are available at:
https://github.com/lcyuanjiang/paddle_nlp