Chapter 5  Deep Learning
7. PaddlePaddle Chinese Examples
1. Chinese Text Classification
1.1 Data Preprocessing
'''
Data source: 56,821 Chinese news abstracts crawled from a news website.
Data content: 10 categories (international, culture, entertainment, sports,
finance, automobile, education, technology, real estate, securities).
'''
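# Assumed raw format (inferred from the code below, not stated in the original):
# each line of news_classify_data.txt holds several fields joined by "_!_";
# the preprocessing reads field [1] as the numeric class label (0-9) and
# field [3] as the title, so a hypothetical line looks like:
#   <id>_!_<label>_!_<category-code>_!_<title>_!_<keywords>
# Adjust the indices if your copy of the dataset is laid out differently.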
import os
from multiprocessing import cpu_count
import numpy as np
import paddle
import paddle.fluid as fluid
data_root = "data/news_classify/"
data_file = "news_classify_data.txt"
test_file = "test_list.txt"
train_file = "train_list.txt"
dict_file = "dict_txt.txt"
data_file_path = data_root + data_file
dict_file_path = data_root + dict_file
test_file_path = data_root + test_file
train_file_path = data_root + train_file
# Build a character-level dictionary from all titles and save it to disk.
def create_dict():
    dict_set = set()
    with open(data_file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    # Collect every distinct character appearing in the titles.
    for line in lines:
        title = line.split("_!_")[-1].replace("\n", "")
        for w in title:
            dict_set.add(w)
    # Assign an integer id to each character.
    dict_list = []
    i = 0
    for s in dict_set:
        dict_list.append([s, i])
        i += 1
    dict_txt = dict(dict_list)
    end_dict = {"<unk>": i}  # id reserved for out-of-vocabulary characters
    dict_txt.update(end_dict)
    with open(dict_file_path, "w", encoding="utf-8") as f:
        f.write(str(dict_txt))
    print("Dictionary created.")
# Encode one title as comma-separated character ids followed by "\t<label>".
def line_encoding(title, dict_txt, label):
    new_line = ""
    for w in title:
        if w in dict_txt:
            code = str(dict_txt[w])
        else:
            code = str(dict_txt["<unk>"])
        new_line = new_line + code + ","
    new_line = new_line[:-1]  # drop the trailing comma
    new_line = new_line + "\t" + label + "\n"
    return new_line
# Encode the raw data and split it into a training set and a test set.
def create_data_list():
    # Truncate the output files.
    with open(test_file_path, "w") as f:
        pass
    with open(train_file_path, "w") as f:
        pass
    with open(dict_file_path, "r", encoding="utf-8") as f_dict:
        dict_txt = eval(f_dict.readlines()[0])
    with open(data_file_path, "r", encoding="utf-8") as f_data:
        lines = f_data.readlines()
    i = 0
    for line in lines:
        words = line.replace("\n", "").split("_!_")
        label = words[1]
        title = words[3]
        new_line = line_encoding(title, dict_txt, label)
        # Every 10th sample goes to the test set, the rest to the training set.
        if i % 10 == 0:
            with open(test_file_path, "a", encoding="utf-8") as f:
                f.write(new_line)
        else:
            with open(train_file_path, "a", encoding="utf-8") as f:
                f.write(new_line)
        i += 1
    print("Test set and training set created.")

create_dict()
create_data_list()
1.2 Model Training and Evaluation
# Return the vocabulary size (number of entries in the dictionary).
def get_dict_len(dict_path):
    with open(dict_path, "r", encoding="utf-8") as f:
        line = eval(f.readlines()[0])
    return len(line.keys())

# Convert one "ids\tlabel" sample into (list of int ids, int label).
def data_mapper(sample):
    data, label = sample
    val = [int(w) for w in data.split(",")]
    return val, int(label)

# Reader for the training set: shuffles the samples each epoch.
def train_reader(train_file_path):
    def reader():
        with open(train_file_path, "r") as f:
            lines = f.readlines()
            np.random.shuffle(lines)
            for line in lines:
                data, label = line.split("\t")
                yield data, label
    # Decode samples in parallel worker processes.
    return paddle.reader.xmap_readers(data_mapper, reader, cpu_count(), 1024)

# Reader for the test set (no shuffling).
def test_reader(test_file_path):
    def reader():
        with open(test_file_path, "r") as f:
            lines = f.readlines()
            for line in lines:
                data, label = line.split("\t")
                yield data, label
    return paddle.reader.xmap_readers(data_mapper, reader, cpu_count(), 1024)
# Text CNN: two sequence convolutions (window sizes 3 and 4) followed by a
# softmax classifier over the concatenated pooled features.
def CNN_net(data, dict_dim, class_dim=10, emb_dim=128, hid_dim=128, hid_dim2=98):
    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
    conv1 = fluid.nets.sequence_conv_pool(input=emb,
                                          num_filters=hid_dim,
                                          filter_size=3,
                                          act="tanh",
                                          pool_type="sqrt")
    conv2 = fluid.nets.sequence_conv_pool(input=emb,
                                          num_filters=hid_dim2,
                                          filter_size=4,
                                          act="tanh",
                                          pool_type="sqrt")
    output = fluid.layers.fc(input=[conv1, conv2],
                             size=class_dim,
                             act="softmax")
    return output
model_save_dir = "model/news_classify/"

# Input placeholders: a variable-length sequence of character ids and a label.
words = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")

dict_dim = get_dict_len(dict_file_path)
model = CNN_net(words, dict_dim)

# Loss and accuracy.
cost = fluid.layers.cross_entropy(input=model, label=label)
avg_cost = fluid.layers.mean(cost)
acc = fluid.layers.accuracy(input=model, label=label)

# Clone the program for evaluation before adding the optimizer ops.
test_program = fluid.default_main_program().clone(for_test=True)
optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.001)
optimizer.minimize(avg_cost)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Batched readers and feeder.
tr_reader = train_reader(train_file_path)
batch_train_reader = paddle.batch(reader=tr_reader, batch_size=128)
ts_reader = test_reader(test_file_path)
batch_test_reader = paddle.batch(reader=ts_reader, batch_size=128)
feeder = fluid.DataFeeder(place=place, feed_list=[words, label])
for pass_id in range(20):
    # Training.
    for batch_id, data in enumerate(batch_train_reader()):
        train_cost, train_acc = exe.run(program=fluid.default_main_program(),
                                        feed=feeder.feed(data),
                                        fetch_list=[avg_cost, acc])
        if batch_id % 100 == 0:
            print("pass_id:%d, batch_id:%d, cost:%f, acc:%f" %
                  (pass_id, batch_id, train_cost[0], train_acc[0]))
    # Evaluation on the test set after each pass.
    test_costs_list = []
    test_accs_list = []
    for batch_id, data in enumerate(batch_test_reader()):
        test_cost, test_acc = exe.run(program=test_program,
                                      feed=feeder.feed(data),
                                      fetch_list=[avg_cost, acc])
        test_costs_list.append(test_cost[0])
        test_accs_list.append(test_acc[0])
    avg_test_cost = sum(test_costs_list) / len(test_costs_list)
    avg_test_acc = sum(test_accs_list) / len(test_accs_list)
    print("pass_id:%d, test_cost:%f, test_acc:%f" %
          (pass_id, avg_test_cost, avg_test_acc))

# Save the trained model for inference.
if not os.path.exists(model_save_dir):
    os.makedirs(model_save_dir)
fluid.io.save_inference_model(model_save_dir,
                              feeded_var_names=[words.name],
                              target_vars=[model],
                              executor=exe)
print("Model saved.")
1.3 Prediction
model_save_dir = "model/news_classify/"

# Encode a sentence with the saved dictionary; unknown characters map to <unk>.
def get_data(sentence):
    with open(dict_file_path, "r", encoding="utf-8") as f:
        dict_txt = eval(f.readlines()[0])
    keys = dict_txt.keys()
    ret = []
    for s in sentence:
        if s not in keys:
            s = "<unk>"
        ret.append(int(dict_txt[s]))
    return ret

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

print("Loading model...")
infer_program, feeded_var_names, target_var = \
    fluid.io.load_inference_model(dirname=model_save_dir, executor=exe)

# Sample news headlines to classify (Chinese, matching the training data).
texts = []
data1 = get_data("在获得诺贝尔文学奖7年之后,莫言15日晚间在山西汾阳贾家庄如是说")
data2 = get_data("综合'今日美国'、《世界日报》等当地媒体报道,芝加哥河滨警察局表示")
data3 = get_data("中国队无缘2020年世界杯")
data4 = get_data("中国人民银行今日发布通知,降低准备金率,预计释放4000亿流动性")
data5 = get_data("10月20日,第六届世界互联网大会正式开幕")
data6 = get_data("同一户型,为什么高层比低层要贵那么多?")
data7 = get_data("揭秘A股周涨5%资金动向:追捧2类股,抛售600亿香饽饽")
data8 = get_data("宋慧乔陷入感染危机,前夫宋仲基不戴口罩露面,身处国外神态轻松")
data9 = get_data("此盆栽花很好养,花美似牡丹,三季开花,南北都能养,很值得栽培")
texts.append(data1)
texts.append(data2)
texts.append(data3)
texts.append(data4)
texts.append(data5)
texts.append(data6)
texts.append(data7)
texts.append(data8)
texts.append(data9)

# Build a LoD tensor: one level whose lengths are the sample lengths.
base_shape = [[len(c) for c in texts]]
tensor_words = fluid.create_lod_tensor(texts, base_shape, place)

result = exe.run(program=infer_program,
                 feed={feeded_var_names[0]: tensor_words},
                 fetch_list=target_var)

# Category names: culture, entertainment, sports, finance, real estate,
# automobile, education, technology, international, securities.
names = ["文化", "娱乐", "体育", "财经", "房产", "汽车", "教育", "科技", "国际", "证券"]
for i in range(len(texts)):
    lab = np.argsort(result)[0][i][-1]  # index of the highest probability
    print("Predicted label: %d, category: %s, probability: %f" %
          (lab, names[lab], result[0][i][lab]))
2. Chinese Sentiment Analysis
2.1 Data Preprocessing and Model Training
''' Dataset
Chinese hotel reviews: 7,766 samples labeled as positive or negative.
'''
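# Assumed raw format (inferred from the code below, not stated in the original):
# each line of hotel_discuss2.csv starts with the label character ("1" for a
# positive review, "0" for a negative one) followed by the review text, e.g.
# a hypothetical line:
#   1,房间干净,服务也不错
# The code reads line[0] as the label and line[1:-1] as the review.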
import os
from multiprocessing import cpu_count
import numpy as np
import paddle
import paddle.fluid as fluid

mydict = {}   # character -> integer code
code = 1      # ids start from 1

data_file = "data/hotel_discuss2.csv"
dict_file = "data/hotel_dict.txt"
encoding_file = "data/hotel_encoding.txt"
puncts = " \n"  # characters to skip

# Build the character dictionary from the raw reviews.
with open(data_file, "r", encoding="utf-8-sig") as f:
    for line in f.readlines():
        trim_line = line.strip()
        for ch in trim_line:
            if ch in puncts:      # skip whitespace
                continue
            if ch in mydict:      # already assigned an id
                continue
            else:
                mydict[ch] = code
                code += 1

code += 1
mydict["<unk>"] = code  # id for out-of-vocabulary characters

with open(dict_file, "w", encoding="utf-8-sig") as f:
    f.write(str(mydict))
print("Dictionary saved.")
# Reload the dictionary from disk.
def load_dict():
    with open(dict_file, "r", encoding="utf-8-sig") as f:
        lines = f.readlines()
        new_dict = eval(lines[0])
    return new_dict

new_dict = load_dict()

# Encode every review as comma-separated character ids followed by "\t<label>".
with open(data_file, "r", encoding="utf-8-sig") as f:
    with open(encoding_file, "w", encoding="utf-8-sig") as fw:
        for line in f.readlines():
            label = line[0]        # first character is the label (0/1)
            remark = line[1:-1]    # the rest of the line is the review text
            for ch in remark:
                if ch in puncts:
                    continue
                else:
                    fw.write(str(mydict[ch]))
                    fw.write(",")
            fw.write("\t" + str(label) + "\n")
print("Data preprocessing finished.")
# Vocabulary size for the embedding layer: the largest character id plus one,
# so every id (including <unk>) falls inside the embedding table.
def get_dict_len(dict_path):
    with open(dict_path, "r", encoding="utf-8-sig") as f:
        lines = f.readlines()
        new_dict = eval(lines[0])
    return max(new_dict.values()) + 1

# Convert one "ids\tlabel" sample into (list of int ids, int label).
def data_mapper(sample):
    dt, lbl = sample
    val = [int(word) for word in dt.split(",") if word.isdigit()]
    return val, int(lbl)

# Reader over the encoded file; samples are shuffled each epoch and decoded
# in parallel worker processes.
def train_reader(train_list_path):
    def reader():
        with open(train_list_path, "r", encoding="utf-8-sig") as f:
            lines = f.readlines()
            np.random.shuffle(lines)
            for line in lines:
                data, label = line.split("\t")
                yield data, label
    return paddle.reader.xmap_readers(data_mapper, reader, cpu_count(), 1024)

# Network: embedding -> fc -> dynamic LSTM; max-pool both the fc output and
# the LSTM output over the sequence and classify with a softmax layer.
def lstm_net(ipt, input_dim):
    ipt = fluid.layers.reshape(ipt, [-1, 1], inplace=True)
    emb = fluid.layers.embedding(input=ipt, size=[input_dim, 128], is_sparse=True)
    fc1 = fluid.layers.fc(input=emb, size=128)
    lstm1, _ = fluid.layers.dynamic_lstm(input=fc1, size=128)
    lstm2 = fluid.layers.sequence_pool(input=lstm1, pool_type="max")
    conv = fluid.layers.sequence_pool(input=fc1, pool_type="max")
    out = fluid.layers.fc(input=[conv, lstm2], size=2, act="softmax")
    return out
dict_len = get_dict_len(dict_file)

# Input placeholders.
rmk = fluid.layers.data(name="rmk", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")

model = lstm_net(rmk, dict_len)

cost = fluid.layers.cross_entropy(input=model, label=label)
avg_cost = fluid.layers.mean(cost)
acc = fluid.layers.accuracy(input=model, label=label)

optimizer = fluid.optimizer.AdagradOptimizer(learning_rate=0.001)
optimizer.minimize(avg_cost)

place = fluid.CUDAPlace(0)  # train on GPU 0; use fluid.CPUPlace() if no GPU is available
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

reader = train_reader(encoding_file)
batch_train_reader = paddle.batch(reader, batch_size=128)
feeder = fluid.DataFeeder(place=place, feed_list=[rmk, label])
for pass_id in range(40):
    for batch_id, data in enumerate(batch_train_reader()):
        train_cost, train_acc = exe.run(program=fluid.default_main_program(),
                                        feed=feeder.feed(data),
                                        fetch_list=[avg_cost, acc])
        if batch_id % 20 == 0:
            print("pass_id: %d, batch_id: %d, cost: %0.5f, acc: %0.5f" %
                  (pass_id, batch_id, train_cost[0], train_acc[0]))
print("Training finished.")

# Save the trained model for inference.
model_save_dir = "model/chn_emotion_analyses.model"
if not os.path.exists(model_save_dir):
    print("Creating model directory")
    os.makedirs(model_save_dir)
fluid.io.save_inference_model(model_save_dir,
                              feeded_var_names=[rmk.name],
                              target_vars=[model],
                              executor=exe)
print("Model saved to:", model_save_dir)
2.2 Prediction
import paddle.fluid as fluid

data_file = "data/hotel_discuss2.csv"
dict_file = "data/hotel_dict.txt"
encoding_file = "data/hotel_encoding.txt"
model_save_dir = "model/chn_emotion_analyses.model"
# Reload the character dictionary saved during preprocessing.
def load_dict():
    with open(dict_file, "r", encoding="utf-8-sig") as f:
        lines = f.readlines()
        new_dict = eval(lines[0])
    return new_dict

# Encode a review with the dictionary; unknown characters map to <unk>.
def encode_by_dict(remark, dict_encoded):
    remark = remark.strip()
    if len(remark) <= 0:
        return []
    ret = []
    for ch in remark:
        if ch in dict_encoded:
            ret.append(dict_encoded[ch])
        else:
            ret.append(dict_encoded["<unk>"])
    return ret
# Reviews to classify (Chinese, matching the training data).
lods = []
new_dict = load_dict()
lods.append(encode_by_dict("总体来说房间非常干净,卫浴设施也相当不错,交通也比较便利", new_dict))
lods.append(encode_by_dict("酒店交通方便,环境也不错,正好是我们办事地点的旁边,感觉性价比还可以", new_dict))
lods.append(encode_by_dict("设施还可以,服务人员态度也好,交通还算便利", new_dict))
lods.append(encode_by_dict("酒店服务态度极差,设施很差", new_dict))
lods.append(encode_by_dict("我住过的最不好的酒店,以后决不住了", new_dict))
lods.append(encode_by_dict("说实在的我很失望,我想这家酒店以后无论如何我都不会再去了", new_dict))

# Build a LoD tensor: one level whose lengths are the sample lengths.
base_shape = [[len(c) for c in lods]]
place = fluid.CPUPlace()
infer_exe = fluid.Executor(place)
infer_exe.run(fluid.default_startup_program())
tensor_words = fluid.create_lod_tensor(lods, base_shape, place)

# Load the inference model and run the prediction.
infer_program, feed_target_names, fetch_targets = \
    fluid.io.load_inference_model(dirname=model_save_dir, executor=infer_exe)
results = infer_exe.run(program=infer_program,
                        feed={feed_target_names[0]: tensor_words},
                        fetch_list=fetch_targets)

# Each row of the softmax output holds the (negative, positive) probabilities.
for i, r in enumerate(results[0]):
    print("negative: %0.5f, positive: %0.5f" % (r[0], r[1]))