This post follows the Fudan University NLP beginner exercises, implementing text classification based on logistic/softmax regression in TensorFlow.
Environment: Python 3.7
Version: TensorFlow 1.13
Dataset
The dataset comes from gaussic's text-classification-cnn-rnn repo: https://github.com/gaussic/text-classification-cnn-rnn
Link: https://pan.baidu.com/s/1hugrfRu Password: qfud
Download cnews.train.txt and place it in the data directory. Each line holds one sample as a tab-separated pair, label\ttext, which is the format pre_data() below relies on.
The dataset has ten categories:
体育|娱乐|家居|房产|教育|时尚|时政|游戏|科技|财经
Configuration
The config folder defines the path information and variables used throughout:
cnews.train.txt: the training dataset
stopwords.txt: the stopword list
categories.txt: the saved category list
best_validation: the best model checkpoint saved during training
import os

pwd_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))


class LrConfig(object):
    # Paths used for training
    dataset_path = os.path.join(pwd_path, 'data', 'cnews.train.txt')
    stopwords_path = os.path.join(pwd_path, 'data', 'stopwords.txt')
    tfidf_model_save_path = os.path.join(pwd_path, 'model', 'tfidf_model.m')
    categories_save_path = os.path.join(pwd_path, 'data', 'categories.txt')
    lr_save_dir = os.path.join(pwd_path, 'model', 'checkpoints')
    lr_save_path = os.path.join(lr_save_dir, 'best_validation')

    # Variables
    num_epochs = 100       # total number of training epochs
    num_classes = 10       # number of classes
    print_per_batch = 10   # report results every this many batches
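One caveat: tf.train.Saver.save() does not create a missing checkpoint directory, so it is worth making sure lr_save_dir exists before training starts. A minimal guard, assuming the LrConfig above:

import os
from config.lr_config import LrConfig

config = LrConfig()
# saver.save() fails if the parent directory does not exist, so create the
# checkpoint directory up front (a no-op if it is already there).
os.makedirs(config.lr_save_dir, exist_ok=True)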
Data directory
The data folder contains:
categories.txt: the category list written out during preprocessing
cnews.train.txt: the training data
stopwords.txt: the stopword list
Data preprocessing
The datahelper folder contains:
data_process.py:
from config.lr_config import LrConfig
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib  # sklearn.externals.joblib is deprecated and removed in newer scikit-learn
import jieba
import numpy as np
config = LrConfig()


class DataProcess(object):
    def __init__(self, dataset_path=None, stopwords_path=None, model_save_path=None):
        self.dataset_path = dataset_path
        self.stopwords_path = stopwords_path
        self.model_save_path = model_save_path

    def read_data(self):
        """Read the raw data and the stopword list."""
        stopwords = list()
        with open(self.dataset_path, encoding='utf-8') as f1:
            data = f1.readlines()
        with open(self.stopwords_path, encoding='utf-8') as f2:
            temp_stopwords = f2.readlines()
        for word in temp_stopwords:
            stopwords.append(word.strip('\n'))  # drop the trailing newline
        return data, stopwords
    def save_categories(self, data, save_path):
        """Write the category names to disk."""
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write('|'.join(data))

    def pre_data(self, data, stopwords, test_size=0.2):
        """Preprocess: tokenize, drop stopwords, one-hot encode the labels."""
        label_list = list()
        text_list = list()
        for line in data:
            label, text = line.split('\t', 1)
            seg_text = [word for word in jieba.cut(text) if word not in stopwords]
            text_list.append(' '.join(seg_text))
            label_list.append(label)
        # Convert the labels to one-hot vectors
        encoder_nums = LabelEncoder()
        label_nums = encoder_nums.fit_transform(label_list)
        categories = list(encoder_nums.classes_)
        self.save_categories(categories, config.categories_save_path)
        label_nums = np.array([label_nums]).T
        encoder_one_hot = OneHotEncoder()
        label_one_hot = encoder_one_hot.fit_transform(label_nums)
        label_one_hot = label_one_hot.toarray()
        return model_selection.train_test_split(text_list, label_one_hot, test_size=test_size, random_state=1024)
    # TODO: implement later
    def get_bow(self):
        """Extract bag-of-words features."""
        pass

    # TODO: the feature matrix can grow too large to fit in memory; for now this is
    # handled by dropping low-frequency words (min_df), and LDA or PCA dimensionality
    # reduction could be added later
    def get_tfidf(self, X_train, X_test):
        """Extract tf-idf features."""
        vectorizer = TfidfVectorizer(min_df=100)
        vectorizer.fit(X_train)
        X_train_vec = vectorizer.transform(X_train)
        X_test_vec = vectorizer.transform(X_test)
        return X_train_vec, X_test_vec, vectorizer

    # TODO: implement later
    def get_word2vec(self):
        """Extract word2vec features."""
        pass
    def provide_data(self):
        """Produce the feature vectors and labels."""
        data, stopwords = self.read_data()
        # 1. bag-of-words features (TODO)
        # 2. tf-idf features
        X_train, X_test, y_train, y_test = self.pre_data(data, stopwords, test_size=0.2)
        X_train_vec, X_test_vec, vectorizer = self.get_tfidf(X_train, X_test)
        joblib.dump(vectorizer, self.model_save_path)
        # 3. word2vec features (TODO)
        return X_train_vec, X_test_vec, y_train, y_test
    def batch_iter(self, x, y, batch_size=64):
        """Generator that shuffles the data and feeds it to the model batch by batch."""
        data_len = len(x)
        num_batch = int((data_len - 1) / batch_size) + 1
        indices = np.random.permutation(np.arange(data_len))
        x_shuffle = x[indices]
        y_shuffle = y[indices]
        for i in range(num_batch):
            start_id = i * batch_size
            end_id = min((i + 1) * batch_size, data_len)
            yield x_shuffle[start_id: end_id], y_shuffle[start_id: end_id]
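To make the preprocessing concrete, here is a toy walkthrough of the one-hot step and the batch iterator with made-up data (purely illustrative; it assumes the DataProcess class above is in scope):

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# The label-encoding step from pre_data() on three toy labels:
labels = ['体育', '娱乐', '体育']
nums = LabelEncoder().fit_transform(labels)  # array([0, 1, 0])
one_hot = OneHotEncoder().fit_transform(np.array([nums]).T).toarray()
print(one_hot)  # [[1. 0.], [0. 1.], [1. 0.]]

# batch_iter() on 200 random samples with the default batch size of 64
# yields three batches of 64 and a final batch of 8:
dp = DataProcess()
x = np.random.rand(200, 5)
y = np.random.rand(200, 10)
for batch_x, batch_y in dp.batch_iter(x, y, batch_size=64):
    print(batch_x.shape, batch_y.shape)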
Model
The model folder: its checkpoints subdirectory stores the saved checkpoints of the trained model.
lr_model.py:
A softmax classification model with a cross-entropy loss and an accuracy metric.
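In symbols, for input $x$, weight matrix $W$, and bias $b$, the model computes $y = \mathrm{softmax}(xW + b)$ and minimizes the mean cross-entropy $L = -\frac{1}{N}\sum_n \sum_i y'_{n,i} \log y_{n,i}$, where $y'$ is the one-hot true label. The code below implements exactly this: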
import tensorflow as tf


class LrModel(object):
    def __init__(self, config, seq_length):
        self.config = config
        self.seq_length = seq_length
        self.lr()

    def lr(self):
        self.x = tf.placeholder(tf.float32, [None, self.seq_length])
        w = tf.Variable(tf.zeros([self.seq_length, self.config.num_classes]))
        b = tf.Variable(tf.zeros([self.config.num_classes]))
        # softmax over the linear scores
        y = tf.nn.softmax(tf.matmul(self.x, w) + b)
        # tf.argmax over axis 1 returns, per row, the index of the largest value,
        # i.e. the predicted class
        self.y_pred_cls = tf.argmax(y, 1)
        # cross-entropy loss, summed over classes and averaged over the batch
        self.y_ = tf.placeholder(tf.float32, [None, self.config.num_classes])
        cross_entropy = -tf.reduce_sum(self.y_ * tf.log(y), reduction_indices=[1])
        self.loss = tf.reduce_mean(cross_entropy)
        self.train_step = tf.train.GradientDescentOptimizer(0.5).minimize(self.loss)
        # tf.argmax gives the per-row index of the largest prediction y and of the
        # true label y_; tf.equal marks where they agree, and the mean of the cast
        # result is the accuracy
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(self.y_, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
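A caveat: taking tf.log of a softmax output can produce NaN losses once any predicted probability underflows to zero. A numerically stable variant in TF 1.x (a sketch against the same variables, not what the code above does) lets TensorFlow fuse the two steps:

        # Sketch: stable loss on the raw logits instead of log(softmax(...))
        logits = tf.matmul(self.x, w) + b
        y = tf.nn.softmax(logits)  # still available for y_pred_cls
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y_, logits=logits)
        self.loss = tf.reduce_mean(cross_entropy)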
Main script
main.py:
Reads the data, trains the model, evaluates it, and saves the best checkpoint.
import time
from datetime import timedelta
from datahelper.data_process import DataProcess
from config.lr_config import LrConfig
from lr_model import LrModel
import tensorflow as tf


def get_time_dif(start_time):
    """Return the time elapsed since start_time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


def evaluate(sess, x_, y_):
    """Evaluate average loss and accuracy on the test set."""
    data_len = len(x_)
    batch_eval = data_get.batch_iter(x_, y_, 128)
    total_loss = 0
    total_acc = 0
    for batch_xs, batch_ys in batch_eval:
        batch_len = len(batch_xs)
        loss, acc = sess.run([model.loss, model.accuracy], feed_dict={model.x: batch_xs, model.y_: batch_ys})
        total_loss += loss * batch_len
        total_acc += acc * batch_len
    return total_loss / data_len, total_acc / data_len
def get_data():
    """Load the training and validation data."""
    print("Loading training and validation data...")
    X_train, X_test, y_train, y_test = data_get.provide_data()
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    return X_train, X_test, y_train, y_test, len(X_train[0])
def train(X_train, X_test, y_train, y_test):
    # Set up the Saver
    saver = tf.train.Saver()
    # Train the model
    print("Training and evaluating...")
    start_time = time.time()
    total_batch = 0              # total number of batches processed
    best_acc_val = 0.0           # best validation accuracy so far
    last_improved = 0            # batch at which the last improvement happened
    require_improvement = 1000   # stop early after this many batches without improvement
    flag = False
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(config.num_epochs):
            batch_train = data_get.batch_iter(X_train, y_train)
            for batch_xs, batch_ys in batch_train:
                if total_batch % config.print_per_batch == 0:
                    loss_train, acc_train = sess.run([model.loss, model.accuracy],
                                                     feed_dict={model.x: X_train, model.y_: y_train})
                    loss_val, acc_val = evaluate(sess, X_test, y_test)
                    if acc_val > best_acc_val:
                        # Save the best result so far
                        best_acc_val = acc_val
                        last_improved = total_batch
                        saver.save(sess=sess, save_path=config.lr_save_path)
                        improve_str = "*"
                    else:
                        improve_str = ""
                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%}, '\
                          + 'Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                    print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improve_str))
                sess.run(model.train_step, feed_dict={model.x: batch_xs, model.y_: batch_ys})
                total_batch += 1
                if total_batch - last_improved > require_improvement:
                    # Validation accuracy has not improved for a long time: stop early
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break
            if flag:
                break
# TODO: add when needed
def test():
    """
    Currently a single corpus is loaded and split into training and validation sets.
    Alternatively, two files could be supplied: one split by sklearn into training and
    validation sets, plus a separate evaluation set tested here. A third option is
    separate training, validation, and test sets, with the test set evaluated here.
    """
    pass
if __name__ == "__main__":
    config = LrConfig()
    data_get = DataProcess(config.dataset_path, config.stopwords_path, config.tfidf_model_save_path)
    X_train, X_test, y_train, y_test, seq_length = get_data()
    model = LrModel(config, seq_length)
    train(X_train, X_test, y_train, y_test)
Prediction
predict.py:
TensorFlow saves and restores a model through the tf.train.Saver class:
saver = tf.train.Saver() creates the Saver object;
saver.save(sess=sess, save_path=config.lr_save_path) writes the model to disk;
saver.restore(sess=session, save_path=config.lr_save_path) reloads the saved parameters, either to continue training or to run on test data.
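A minimal, self-contained save/restore round trip (illustrative only; the path and variable are made up):

import os
import tensorflow as tf

os.makedirs('./model/checkpoints', exist_ok=True)
v = tf.Variable(42.0, name='v')
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.save(sess=sess, save_path='./model/checkpoints/demo')
with tf.Session() as sess:
    # restore() takes the place of the initializer: values come from the checkpoint
    saver.restore(sess=sess, save_path='./model/checkpoints/demo')
    print(sess.run(v))  # 42.0

Note that save_path is a prefix, not a single file: Saver writes demo.index, demo.meta, and demo.data-* alongside a checkpoint bookkeeping file, and restore() takes the same prefix.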
Predicting a single piece of text with the trained model:
import tensorflow as tf
# from sklearn.externals import joblib
import joblib
import jieba
from config.lr_config import LrConfig
from lr_model import LrModel


def pre_data(data, config):
    """Tokenize and remove stopwords."""
    stopwords = list()
    text_list = list()
    with open(config.stopwords_path, 'r', encoding='utf-8') as f:
        for word in f.readlines():
            stopwords.append(word.strip('\n'))
    seg_text = jieba.cut(data)
    text = [word for word in seg_text if word not in stopwords]
    text_list.append(' '.join(text))
    return text_list
def read_categories():
    """Read the category list."""
    with open(config.categories_save_path, 'r', encoding='utf-8') as f:
        categories = f.readlines()
    return categories[0].split('|')


def predict_line(data, categories):
    """Predict the category of one sample."""
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    # Create a Saver object first
    saver = tf.train.Saver()
    # saver.restore reloads the trained parameters into the session
    saver.restore(sess=session, save_path=config.lr_save_path)
    y_pred_cls = session.run(model.y_pred_cls, feed_dict={model.x: data})
    return categories[y_pred_cls[0]]
if __name__ == "__main__":
    data = "三星ST550以全新的拍摄方式超越了以往任何一款数码相机"
    config = LrConfig()
    line = pre_data(data, config)
    tfidf_model = joblib.load(config.tfidf_model_save_path)
    X_test = tfidf_model.transform(line).toarray()
    model = LrModel(config, len(X_test[0]))
    categories = read_categories()
    print(predict_line(X_test, categories))
Results
Output of main.py:
Training ran for 13,640 batches in about 56 minutes; the best checkpoint was
Iter: 12640, Train Loss: 0.25, Train Acc: 95.61%, Val Loss: 0.26, Val Acc: 95.63%, Time: 0:52:48 *
that is, training loss 0.25 with training accuracy 95.61%, and validation loss 0.26 with validation accuracy 95.63%.
Output of predict.py:
The test input was:
data = "三星ST550以全新的拍摄方式超越了以往任何一款数码相机"
Reference code
https://github.com/Alic-yuan/nlp-beginner-finish/tree/master/task1