NLP Beginner Practice 1: Machine-Learning-Based Text Classification


This post follows the Fudan University NLP beginner exercises; it implements text classification based on logistic/softmax regression with TensorFlow.
Environment: Python 3.7
TensorFlow version: 1.13
Dataset:
The dataset is gaussic's dataset: https://github.com/gaussic/text-classification-cnn-rnn
Download link: https://pan.baidu.com/s/1hugrfRu  Password: qfud
From the download, take cnews.train.txt and place it in the data directory.
The dataset has ten categories:
体育|娱乐|家居|房产|教育|时尚|时政|游戏|科技|财经 (sports | entertainment | home | real estate | education | fashion | politics | games | technology | finance)
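Each line of cnews.train.txt is a category label, a tab character, and the raw news text; the preprocessing code below splits every line on its first tab. A quick way to sanity-check the file after downloading it (run from the project root, with the path as defined in the config below):

with open('data/cnews.train.txt', encoding='utf-8') as f:
    label, text = f.readline().split('\t', 1)
    print(label)       # e.g. 体育
    print(text[:30])   # the first 30 characters of the news article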

Configuration file

The config folder defines the file paths and variables used by the project:
cnews.train.txt is the training dataset
stopwords.txt is the stopword list
categories.txt is the category list

best_validation is the prefix under which the best model is saved

import os

# project root: the parent directory of the config/ folder that holds this file
pwd_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

class LrConfig(object):
    #  Paths used for training the model
    dataset_path = os.path.join(pwd_path, 'data', 'cnews.train.txt')
    stopwords_path = os.path.join(pwd_path, 'data', 'stopwords.txt')
    tfidf_model_save_path = os.path.join(pwd_path, 'model', 'tfidf_model.m')
    categories_save_path = os.path.join(pwd_path, 'data', 'categories.txt')

    lr_save_dir = os.path.join(pwd_path, 'model', 'checkpoints')
    lr_save_path = os.path.join(lr_save_dir, 'best_validation')
    #  Variables
    num_epochs = 100  # total number of training epochs
    num_classes = 10  # number of classes
    print_per_batch = 10  # report metrics every this many batches

Dataset

The data folder:
categories.txt: the ten category names on a single line, separated by '|'
cnews.train.txt: the training samples, one per line, each a label followed by a tab and the news text
stopwords.txt: the stopword list, one word per line

Data preprocessing

The datahelper folder:
data_process.py:

from config.lr_config import LrConfig
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.externals import joblib  # on newer scikit-learn versions, use "import joblib" instead
import jieba
import numpy as np

config = LrConfig()

class DataProcess(object):
    def __init__(self, dataset_path=None, stopwords_path=None, model_save_path=None):
        self.dataset_path = dataset_path
        self.stopwords_path = stopwords_path
        self.model_save_path = model_save_path

    def read_data(self):
        """Read the raw corpus and the stopword list"""
        stopwords = list()
        with open(self.dataset_path, encoding='utf-8') as f1:
            data = f1.readlines()
        with open(self.stopwords_path, encoding='utf-8') as f2:
            temp_stopwords = f2.readlines()
        for word in temp_stopwords:
            stopwords.append(word.rstrip('\n'))  # strip only the trailing newline
        return data, stopwords

    def save_categories(self, data, save_path):
        """Write the category names to a local file, joined by '|'"""
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write('|'.join(data))

    def pre_data(self, data, stopwords, test_size=0.2):
        """Preprocess the data: segment, remove stopwords, encode labels"""
        label_list = list()
        text_list = list()
        for line in data:
            label, text = line.split('\t', 1)
            # print(label)
            seg_text = [word for word in jieba.cut(text) if word not in stopwords]
            text_list.append(' '.join(seg_text))
            label_list.append(label)
        # Convert the labels to one-hot format
        encoder_nums = LabelEncoder()
        label_nums = encoder_nums.fit_transform(label_list)
        categories = list(encoder_nums.classes_)
        self.save_categories(categories, config.categories_save_path)
        label_nums = np.array([label_nums]).T
        encoder_one_hot = OneHotEncoder()
        label_one_hot = encoder_one_hot.fit_transform(label_nums)
        label_one_hot = label_one_hot.toarray()
        return model_selection.train_test_split(text_list, label_one_hot, test_size=test_size, random_state=1024)

    # TODO: to be implemented later
    def get_bow(self):
        """Extract bag-of-words features"""
        pass

    # TODO: the feature dimension here can get large enough to run out of memory; for now this is
    # handled by dropping low-frequency words (min_df=100), and LDA or PCA dimensionality reduction
    # could be added later (a sketch follows this listing)
    def get_tfidf(self, X_train, X_test):
        """Extract TF-IDF features"""
        vectorizer = TfidfVectorizer(min_df=100)
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)
        return X_train_vec, X_test_vec, vectorizer

    # TODO: to be implemented later
    def get_word2vec(self):
        """Extract word2vec features"""
        pass

    def provide_data(self):
        """Provide the vectorized train/test data"""
        data, stopwords = self.read_data()
        #  1. Extract bag-of-words features
        #  2. Extract TF-IDF features
        X_train, X_test, y_train, y_test = self.pre_data(data, stopwords, test_size=0.2)
        X_train_vec, X_test_vec, vectorizer = self.get_tfidf(X_train, X_test)
        joblib.dump(vectorizer, self.model_save_path)
        #  3. Extract word2vec features
        return X_train_vec, X_test_vec, y_train, y_test

    def batch_iter(self, x, y, batch_size=64):
        """Iterator that feeds the data to the model batch by batch"""
        data_len = len(x)
        num_batch = int((data_len-1)/batch_size)+1
        indices = np.random.permutation(np.arange(data_len))
        x_shuffle = x[indices]
        y_shuffle = y[indices]
        for i in range(num_batch):
            start_id = i*batch_size
            end_id = min((i+1)*batch_size, data_len)
            yield x_shuffle[start_id: end_id], y_shuffle[start_id: end_id]
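The TODO above get_tfidf mentions shrinking the TF-IDF features with LDA or PCA. A minimal sketch of that idea uses scikit-learn's TruncatedSVD, which, unlike plain PCA, works directly on the sparse matrices returned by TfidfVectorizer; the function name and the 300-component setting are illustrative choices, not part of the original project:

from sklearn.decomposition import TruncatedSVD

def reduce_tfidf(X_train_vec, X_test_vec, n_components=300):
    """Project sparse TF-IDF features onto a lower-dimensional dense space (LSA)."""
    svd = TruncatedSVD(n_components=n_components, random_state=1024)
    X_train_reduced = svd.fit_transform(X_train_vec)  # fit the projection on the training set only
    X_test_reduced = svd.transform(X_test_vec)        # reuse the same projection for the test set
    return X_train_reduced, X_test_reduced, svd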

model

The checkpoints folder stores the models saved during training.

Model

lr_model.py:
A classification model built with softmax, a cross-entropy loss function, and accuracy as the evaluation metric.
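Concretely, for the logits z = xW + b, softmax turns them into class probabilities p_i = exp(z_i) / Σ_j exp(z_j), and the cross-entropy loss against a one-hot label y is -Σ_i y_i·log(p_i), averaged over the batch; the code below implements exactly this.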

import tensorflow as tf

class LrModel(object):
    def __init__(self, config, seq_length):
        self.config = config
        self.seq_length = seq_length
        self.lr()

    def lr(self):
        self.x = tf.placeholder(tf.float32, [None, self.seq_length])
        w = tf.Variable(tf.zeros([self.seq_length, self.config.num_classes]))
        b = tf.Variable(tf.zeros([self.config.num_classes]))
        # softmax over the linear layer
        y = tf.nn.softmax(tf.matmul(self.x, w) + b)
        # tf.argmax returns the index of the largest value along an axis; axis 1 means per row
        self.y_pred_cls = tf.argmax(y, 1)
        # cross-entropy loss (a numerically safer variant is sketched after this listing)
        self.y_ = tf.placeholder(tf.float32, [None, self.config.num_classes])
        cross_entropy = tf.reduce_mean(-tf.reduce_sum(self.y_ * tf.log(y), reduction_indices=[1]))
        self.loss = tf.reduce_mean(cross_entropy)  # cross_entropy is already a scalar, so this is the same value

        self.train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
        # per row, tf.argmax picks the true and predicted class indices; tf.equal marks where they
        # agree, i.e. the correct predictions, and their mean is the accuracy
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(self.y_, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
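A side note: computing tf.nn.softmax and then tf.log by hand can produce NaN losses if a predicted probability underflows to 0. If that ever happens, a common TF 1.x workaround (a sketch, not what the code above does) is to keep the raw logits and use the fused op, replacing the corresponding lines inside lr():

        logits = tf.matmul(self.x, w) + b
        self.y_pred_cls = tf.argmax(tf.nn.softmax(logits), 1)
        # fused softmax + cross-entropy, numerically safer than softmax followed by tf.log
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y_, logits=logits)
        self.loss = tf.reduce_mean(cross_entropy)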

Main function

main.py:
Reads the data, trains the model, evaluates it, and saves the best checkpoint.

import time
from datetime import timedelta
from datahelper.data_process import DataProcess
from config.lr_config import LrConfig
from lr_model import LrModel
import tensorflow as tf

def get_time_dif(start_time):
    """Return the elapsed time"""
    end_time = time.time()
    time_dif = end_time-start_time
    return timedelta(seconds=int(round(time_dif)))


def evaluate(sess, x_, y_):
    """Evaluate loss and accuracy on the test set"""
    data_len = len(x_)
    batch_eval = data_get.batch_iter(x_, y_, 128)
    total_loss = 0
    total_acc = 0
    for batch_xs, batch_ys in batch_eval:
        batch_len = len(batch_xs)
        loss, acc = sess.run([model.loss, model.accuracy], feed_dict={model.x: batch_xs, model.y_: batch_ys})
        total_loss += loss * batch_len
        total_acc += acc * batch_len
    return total_loss/data_len, total_acc/data_len


def get_data():
    # Load the training and validation data
    print("Loading training and validation data...")
    X_train, X_test, y_train, y_test = data_get.provide_data()
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    return X_train, X_test, y_train, y_test, len(X_train[0])


def train(X_train, X_test, y_train, y_test):
    # Set up the Saver
    saver = tf.train.Saver()
    # Train the model
    print("Training and evaluating...")
    start_time = time.time()
    total_batch = 0  # total number of batches processed
    best_acc_val = 0.0  # best validation accuracy so far
    last_improved = 0  # batch at which the last improvement happened
    require_improvement = 1000  # stop early if there is no improvement for more than 1000 batches
    flag = False
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(config.num_epochs):
            batch_train = data_get.batch_iter(X_train, y_train)
            for batch_xs, batch_ys in batch_train:
                if total_batch % config.print_per_batch == 0:
                    loss_train, acc_train = sess.run([model.loss, model.accuracy], feed_dict={model.x: X_train, model.y_: y_train})
                    loss_val, acc_val = evaluate(sess, X_test, y_test)

                    if acc_val > best_acc_val:
                        # Save the best result so far
                        best_acc_val = acc_val
                        last_improved = total_batch
                        saver.save(sess=sess, save_path=config.lr_save_path)
                        improve_str = "*"
                    else:
                        improve_str = ""
                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%}, '\
                           + 'Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                    print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improve_str))
                sess.run(model.train_step, feed_dict={model.x: batch_xs, model.y_: batch_ys})
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    #  Validation accuracy has not improved for a long time: stop training early
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break
            if flag:
                break

# TODO: implement later if needed
def test():
    """
    Currently a single corpus is read in and split into a training set and a validation set.
    Alternatively, two corpora could be supplied: a training corpus split by sklearn into train/validation,
    plus a separate set evaluated here; or train/validation/test sets, with the test set evaluated here.
    """
    pass

if __name__ == "__main__":
    config = LrConfig()
    data_get = DataProcess(config.dataset_path, config.stopwords_path, config.tfidf_model_save_path)
    X_train, X_test, y_train, y_test, seq_length = get_data()
    model = LrModel(config, seq_length)
    train(X_train, X_test, y_train, y_test)
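Assuming the folder layout described above, with main.py and lr_model.py in the project root next to the config/, data/, datahelper/ and model/ directories, training is started from the project root with:

python main.py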

Prediction

predict.py:
TensorFlow saves and restores neural network models through the tf.train.Saver class:

saver = tf.train.Saver()  # first create a Saver object
saver.save(sess=sess, save_path=config.lr_save_path)  # saver.save saves the model

# saver.restore reloads the saved parameters, either to continue training or to run on test data
saver.restore(sess=session, save_path=config.lr_save_path)
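For reference, lr_save_path is a filename prefix rather than a single file: saver.save writes best_validation.index, best_validation.data-00000-of-00001 and best_validation.meta into the checkpoints directory (which should already exist), together with a small checkpoint bookkeeping file, and saver.restore loads the variables back from that same prefix.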

Classifying a single piece of text with the model:

import tensorflow as tf
# from sklearn.externals import joblib  # deprecated import; joblib is imported directly instead
import joblib
import jieba
from config.lr_config import LrConfig
from lr_model import LrModel

def pre_data(data, config):
    """Segment the text with jieba and remove stopwords"""
    stopwords = list()
    text_list = list()
    with open(config.stopwords_path, 'r', encoding='utf-8') as f:
        for word in f.readlines():
            stopwords.append(word.rstrip('\n'))  # strip only the trailing newline
    seg_text = jieba.cut(data)
    text = [word for word in seg_text if word not in stopwords]
    text_list.append(' '.join(text))
    return text_list


def read_categories():
    """Read the category names"""
    with open(config.categories_save_path, 'r', encoding='utf-8') as f:
        categories = f.readlines()
    return categories[0].split('|')


def predict_line(data, categories):
    """Predict the category of one vectorized line of text"""
    session = tf.Session()
    session.run(tf.global_variables_initializer())

    # First create a Saver object
    saver = tf.train.Saver()
    # saver.restore reloads the trained parameters into the session
    saver.restore(sess=session, save_path=config.lr_save_path)

    y_pred_cls = session.run(model.y_pred_cls, feed_dict={model.x: data})
    return categories[y_pred_cls[0]]


if __name__ == "__main__":
    data = "三星ST550以全新的拍摄方式超越了以往任何一款数码相机"
    config = LrConfig()
    line = pre_data(data, config)
    tfidf_model = joblib.load(config.tfidf_model_save_path)
    X_test = tfidf_model.transform(line).toarray()
    model = LrModel(config, len(X_test[0]))
    categories = read_categories()
    print(predict_line(X_test, categories))
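Note that the restored weights fit this newly built graph only because LrModel is constructed with len(X_test[0]), the vocabulary size of the saved TfidfVectorizer, which is the same seq_length that was used during training.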

Results

Output of the main function:
Training ran for a total of 13,640 batches and took about 56 minutes; the best result was:

Iter: 12640, Train Loss: 0.25, Train Acc: 95.61%, Val Loss: 0.26, Val Acc: 95.63%, Time: 0:52:48 *
Training loss: 0.25, training accuracy: 95.61%
Validation loss: 0.26, validation accuracy: 95.63%

Prediction result:
The test input was:

data = "三星ST550以全新的拍摄方式超越了以往任何一款数码相机"

Reference code

https://github.com/Alic-yuan/nlp-beginner-finish/tree/master/task1
