- Data cleaning and preprocessing: tokenization, removal of stop words and low-frequency words
- Feature engineering + classifier:
  - Feature engineering: convert each text into a fixed-dimension vector
  - Classifier: LR, SVM, GBDT, etc.
1. Feature Engineering (unsupervised learning)
- Bag of words (unigram): ignores word order and word correlations; not well suited to short-text classification
- Bag of n-grams (bi-gram, tri-gram): captures local word order, but the combination space is large and the parameter count grows quickly (see the sketch right after this list)
- TF-IDF (term frequency-inverse document frequency): a word's importance increases with how often it appears in a document, but decreases with how often it appears across the corpus (similar in spirit to keyword extraction)
- word2vec (commonly used): maps words into a vector space where similarity can be computed. fastText averages all the word vectors to get a text representation, then feeds it to a classifier and computes class probabilities with softmax (an averaging sketch follows the word2vec example below)
- LDA (dimensionality reduction): computes a document's topic distribution; suited to long-text classification (a sketch also follows the word2vec example)
- LSI: based on matrix factorization; computes a semantic vector for each text (I haven't used it)
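Bag-of-n-grams example
A minimal sketch with scikit-learn's CountVectorizer; the toy docs corpus here is made up for illustration, and real input would be the cleaned, tokenized reviews.
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus (hypothetical stand-in for the real tokenized documents)
docs = ['the screen is good', 'the screen is not good', 'not a good screen']
# ngram_range=(1, 2) keeps unigrams and bigrams, so local word order is captured
vectorizer = CountVectorizer(ngram_range=(1, 2))
counts = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())  # get_feature_names() in sklearn < 1.0
print(counts.shape)  # the feature count grows quickly as n increases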
TF-IDF example
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# corpus_set: an iterable of documents, each a space-joined string of tokens
# A float min_df drops terms whose document frequency is below that fraction
vectorizer = CountVectorizer(min_df=1e-5)  # drop df < 1e-5, i.e. low-frequency words
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus_set))
words = vectorizer.get_feature_names_out()  # get_feature_names() in sklearn < 1.0
print("how many words: {0}".format(len(words)))
print("tf-idf shape: ({0},{1})".format(tfidf.shape[0], tfidf.shape[1]))
"""
how many words: 379000
tf-idf shape: (65000,379000)
"""
word2vec example
# coding:utf-8
import time

from gensim.models import Word2Vec


class MySentences(object):
    """Stream sentences from a file: one tokenized (space-separated) review per line."""
    def __init__(self, file_path):
        self.file_path = file_path

    def __iter__(self):
        with open(self.file_path) as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                yield line.split()


def train_embedding(in_path, model_path, write_flag, out_path, vec_size=100, min_count=5, epochs=5):
    """
    Train word vectors on the tokenized user reviews.
    :param in_path: path to the tokenized user-review file
    :param model_path: path to save the trained model
    :param write_flag: whether to also dump the word vectors as plain text
    :param out_path: path to save the plain-text word vectors
    :param vec_size: word-vector dimensionality
    :param min_count: drop words with frequency below this threshold
    :param epochs: number of training passes over the corpus
    :return:
    """
    start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print('word2vec training start time:', start_time)
    sentences = MySentences(in_path)
    # gensim >= 4.0 renamed size -> vector_size and iter -> epochs
    model = Word2Vec(sentences, vector_size=vec_size, min_count=min_count, epochs=epochs)
    model.save(model_path)
    if write_flag:
        words = list(model.wv.key_to_index)  # model.wv.vocab in gensim < 4.0
        with open(out_path, 'w') as file:
            for word in words:
                vec = model.wv[word]
                file.write('{}\t{}\n'.format(word, '\t'.join(map(str, vec))))
    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print('word2vec model saved to:', model_path)
    print('word2vec training end time:', end_time)


def find_embedding(model_path):
    model = Word2Vec.load(model_path)
    try:
        print(model.wv['UNK'])
    except KeyError:
        print("'UNK' is not in the vocabulary")


if __name__ == '__main__':
    data_path = '/Users/crz/Downloads/用户评价/分词后用户评价/all_fenci_filter_dp_用户评价_20180322-31.txt'
    model_path = '/Users/crz/Downloads/用户评价/分词后用户评价/word2vec_model/word2vec.model'
    write_flag = True
    out_path = '/Users/crz/Downloads/用户评价/分词后用户评价/word2vec.txt'
    # train_embedding(data_path, model_path, write_flag, out_path, min_count=1)
    find_embedding(model_path)
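As noted in the feature list, a fastText-style text representation simply averages a document's word vectors. A minimal sketch reusing the model trained above; the text_vector helper and the example tokens are my own illustration, not part of the original pipeline:
import numpy as np

def text_vector(tokens, model):
    """Average the in-vocabulary word vectors into one fixed-size document vector."""
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    if not vecs:
        # No known word: fall back to a zero vector
        return np.zeros(model.wv.vector_size)
    return np.mean(vecs, axis=0)

# model = Word2Vec.load(model_path)  # reuse the model trained above
# doc_vec = text_vector('这个 商品 很 好'.split(), model)  # then feed doc_vec to a classifier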
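LDA example
A minimal sketch with gensim's LdaModel; the toy texts corpus and the num_topics value are made up for illustration. The per-document topic distribution serves as a low-dimensional feature vector.
from gensim import corpora
from gensim.models import LdaModel

# Toy tokenized corpus (hypothetical stand-in for the real documents)
texts = [['screen', 'bright', 'battery'], ['battery', 'lasts', 'long'], ['shipping', 'slow', 'battery']]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(bow_corpus, id2word=dictionary, num_topics=2, passes=10)
print(lda.get_document_topics(bow_corpus[0]))  # list of (topic_id, probability) pairs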
2. Classifiers
Logistic regression classifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix

# LogisticRegression classification model
lr_model = LogisticRegression()
lr_model.fit(train_set, train_label)
print("val mean accuracy: {0}".format(lr_model.score(val_set, val_label)))
y_pred = lr_model.predict(test_set)
print(classification_report(test_label, y_pred))
Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_model = RandomForestClassifier(n_estimators=200, random_state=1080)
rf_model.fit(train_set, train_label)
print("val mean accuracy: {0}".format(rf_model.score(val_set, val_label)))
y_pred = rf_model.predict(test_set)
print(classification_report(test_label, y_pred))
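GBDT classifier
The feature list above also mentions GBDT; here is a minimal sketch with scikit-learn's GradientBoostingClassifier, assuming the same train_set/val_set/test_set variables as above (the hyperparameters are illustrative, and XGBoost/LightGBM are common drop-in alternatives). Note that GBDT training on very high-dimensional sparse tf-idf features can be slow.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Hyperparameters here are illustrative defaults, not tuned values
gbdt_model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=1080)
gbdt_model.fit(train_set, train_label)
print("val mean accuracy: {0}".format(gbdt_model.score(val_set, val_label)))
y_pred = gbdt_model.predict(test_set)
print(classification_report(test_label, y_pred))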