- Data cleaning and preprocessing: tokenization, removal of stop words and low-frequency words
- Feature engineering + classifier:
  - Feature engineering: convert each text into a fixed-dimension vector
  - Classifier: LR, SVM, GBDT, etc.
1. Feature Engineering (unsupervised learning)
- Bag of words (unigram): ignores word order and word correlations; not well suited to short-text classification
- Bag of n-grams (bi-gram, tri-gram): captures local word order, but the combination space is large and the parameter count grows quickly (see the sketch right after this list)
- TF-IDF (term frequency-inverse document frequency): a word's importance increases with how often it appears in a document, but decreases with how often it appears across the corpus (similar in spirit to keyword extraction)
- word2vec (commonly used): maps words into a vector space where similarity can be computed. fastText averages all the word vectors to get a text representation, then feeds it to a classifier and computes class probabilities with softmax (an averaging sketch follows the word2vec example below)
- LDA (dimensionality reduction): computes a document's topic distribution; suited to long-text classification (a sketch also follows the word2vec example)
- LSI: based on matrix factorization; computes a semantic vector for each text (I haven't used it)
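Bag-of-n-grams example
A minimal sketch with scikit-learn's CountVectorizer; the toy docs corpus here is made up for illustration, and real input would be the cleaned, tokenized reviews.
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus (hypothetical stand-in for the real tokenized documents)
docs = ['the screen is good', 'the screen is not good', 'not a good screen']
# ngram_range=(1, 2) keeps unigrams and bigrams, so local word order is captured
vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())  # get_feature_names() in sklearn < 1.0
print(counts.shape)  # the feature count grows quickly as n increases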
TF-IDF example
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# corpus_set: an iterable of documents, each a space-joined string of tokens
# A float min_df drops terms whose document frequency is below that fraction
vectorizer = CountVectorizer(min_df=1e-5)  # drop df < 1e-5, i.e. low-frequency words
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus_set))
words = vectorizer.get_feature_names_out()  # get_feature_names() in sklearn < 1.0
print("how many words: {0}".format(len(words)))
print("tf-idf shape: ({0},{1})".format(tfidf.shape[0], tfidf.shape[1]))
"""
how many words: 379000
tf-idf shape: (65000,379000)
"""
word2vec example
# coding:utf-8
import time

from gensim.models import Word2Vec


class MySentences(object):
    """Stream sentences from a file: one tokenized (space-separated) review per line."""
    def __init__(self, file_path):
        self.file_path = file_path

    def __iter__(self):
        with open(self.file_path) as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                yield line.split()


def train_embedding(in_path, model_path, write_flag, out_path, vec_size=100, min_count=5, epochs=5):
    """
    Train word vectors on the tokenized user reviews.
    :param in_path: path to the tokenized user-review file
    :param model_path: path to save the trained model
    :param write_flag: whether to also dump the word vectors as plain text
    :param out_path: path to save the plain-text word vectors
    :param vec_size: word-vector dimensionality
    :param min_count: drop words with frequency below this threshold
    :param epochs: number of training passes over the corpus
    :return:
    """
    start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print('word2vec training start time:', start_time)
    sentences = MySentences(in_path)
    # gensim >= 4.0 renamed size -> vector_size and iter -> epochs
    model = Word2Vec(sentences, vector_size=vec_size, min_count=min_count, epochs=epochs)
    model.save(model_path)
    if write_flag:
        words = list(model.wv.key_to_index)  # model.wv.vocab in gensim < 4.0
        with open(out_path, 'w') as file:
            for word in words:
                vec = model.wv[word]
                file.write('{}\t{}\n'.format(word, '\t'.join(map(str, vec))))
    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print('word2vec model saved to:', model_path)
    print('word2vec training end time:', end_time)


def find_embedding(model_path):
    model = Word2Vec.load(model_path)
    try:
        print(model.wv['UNK'])
    except KeyError:
        print("'UNK' is not in the vocabulary")


if __name__ == '__main__':
    data_path = '/Users/crz/Downloads/用户评价/分词后用户评价/all_fenci_filter_dp_用户评价_20180322-31.txt'
    model_path = '/Users/crz/Downloads/用户评价/分词后用户评价/word2vec_model/word2vec.model'
    write_flag = True
    out_path = '/Users/crz/Downloads/用户评价/分词后用户评价/word2vec.txt'
    # train_embedding(data_path, model_path, write_flag, out_path, min_count=1)
    find_embedding(model_path)
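As noted in the feature list, a fastText-style text representation simply averages a document's word vectors. A minimal sketch reusing the model trained above; the text_vector helper and the example tokens are my own illustration, not part of the original pipeline:
import numpy as np

def text_vector(tokens, model):
    """Average the in-vocabulary word vectors into one fixed-size document vector."""
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    if not vecs:
        # No known word: fall back to a zero vector
        return np.zeros(model.wv.vector_size)
    return np.mean(vecs, axis=0)

# model = Word2Vec.load(model_path)  # reuse the model trained above
# doc_vec = text_vector('这个 商品 很 好'.split(), model)  # then feed doc_vec to a classifier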
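LDA example
A minimal sketch with gensim's LdaModel; the toy texts corpus and the num_topics value are made up for illustration. The per-document topic distribution serves as a low-dimensional feature vector.
from gensim import corpora
from gensim.models import LdaModel

# Toy tokenized corpus (hypothetical stand-in for the real documents)
texts = [['screen', 'bright', 'battery'], ['battery', 'lasts', 'long'], ['shipping', 'slow', 'battery']]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(bow_corpus, id2word=dictionary, num_topics=2, passes=10)
print(lda.get_document_topics(bow_corpus[0]))  # list of (topic_id, probability) pairs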
2. Classifiers
Logistic regression classifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix

# LogisticRegression classification model
lr_model = LogisticRegression()
lr_model.fit(train_set, train_label)
print("val mean accuracy: {0}".format(lr_model.score(val_set, val_label)))
y_pred = lr_model.predict(test_set)
print(classification_report(test_label, y_pred))
Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_model = RandomForestClassifier(n_estimators=200, random_state=1080)
rf_model.fit(train_set, train_label)
print("val mean accuracy: {0}".format(rf_model.score(val_set, val_label)))
y_pred = rf_model.predict(test_set)
print(classification_report(test_label, y_pred))
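GBDT classifier
The feature list above also mentions GBDT; here is a minimal sketch with scikit-learn's GradientBoostingClassifier, assuming the same train_set/val_set/test_set variables as above (the hyperparameters are illustrative, and XGBoost/LightGBM are common drop-in alternatives). Note that GBDT training on very high-dimensional sparse tf-idf features can be slow.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Hyperparameters here are illustrative defaults, not tuned values
gbdt_model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=1080)
gbdt_model.fit(train_set, train_label)
print("val mean accuracy: {0}".format(gbdt_model.score(val_set, val_label)))
y_pred = gbdt_model.predict(test_set)
print(classification_report(test_label, y_pred))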