# Toy corpus: four short English sentences used throughout the TF-IDF walkthrough.
corpus = [
    'this is the first document',
    'this is the second second document',
    'and the third one',
    'is this the first document',
]
# 1、分词————————————————————————————
[输入]:
# 1. Tokenize: split every document on single spaces.
# Idiomatic fix: a list comprehension instead of indexing via range(len(...)).
word_list = [doc.split(' ') for doc in corpus]
print(word_list)
[输出]:
[['this', 'is', 'the', 'first', 'document'],
['this', 'is', 'the', 'second', 'second', 'document'],
['and', 'the', 'third', 'one'],
['is', 'this', 'the', 'first', 'document']]
# 2、统计词频————————————————————————————
[输入]:
# 2. Per-document term frequencies: one Counter per tokenized document.
# Fix: `Counter` was used without ever being imported in the original snippet.
from collections import Counter

countlist = [Counter(words) for words in word_list]
countlist
[输出]:
[Counter({'document': 1, 'first': 1, 'is': 1, 'the': 1, 'this': 1}),
Counter({'document': 1, 'is': 1, 'second': 2, 'the': 1, 'this': 1}),
Counter({'and': 1, 'one': 1, 'the': 1, 'third': 1}),
Counter({'document': 1, 'first': 1, 'is': 1, 'the': 1, 'this': 1})]
# 3、定义计算tfidf公式的函数———————————————
# word可以通过count得到,count可以通过countlist得到
# count[word]可以得到每个单词的词频, sum(count.values())得到整个句子的单词总数
def tf(word, count):
    """Term frequency: occurrences of `word` divided by the document's total token count.

    `count` is a Counter (or dict) mapping each word of one document to its frequency.
    """
    total_tokens = sum(count.values())
    return count[word] / total_tokens
# 统计的是含有该单词的句子数
def n_containing(word, count_list):
    """Document frequency: how many documents in `count_list` contain `word` at least once."""
    return len([count for count in count_list if word in count])
# len(count_list)是指句子的总数,n_containing(word, count_list)是指含有该单词的句子的总数,加1是为了防止分母为0
def idf(word, count_list):
    """Inverse document frequency: log(N / (1 + df)).

    The +1 in the denominator guards against division by zero for unseen
    words; note it also makes the value negative for words that appear in
    every document (visible as 'the' in the sample output).
    """
    # Document frequency, computed inline over the per-document Counters.
    docs_with_word = sum(1 for count in count_list if word in count)
    return math.log(len(count_list) / (1 + docs_with_word))
# 将tf和idf相乘
def tfidf(word, count, count_list):
    """TF-IDF score of `word` in the document `count`, relative to the corpus `count_list`."""
    term_frequency = tf(word, count)
    inverse_doc_frequency = idf(word, count_list)
    return term_frequency * inverse_doc_frequency
# 4、计算每个单词的tfidf值————————————
[输入]:
# 4. Score every word of every document and print them, highest TF-IDF first.
import math  # used by idf(); ideally this import lives at the top of the file

for i, count in enumerate(countlist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, count, countlist) for word in count}
    sorted_words = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    # Fix: the original iterated `sorted_words[:]` — a pointless full copy.
    for word, score in sorted_words:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
[输出]:
Top words in document 1
Word: first, TF-IDF: 0.05754
Word: this, TF-IDF: 0.0
Word: is, TF-IDF: 0.0
Word: document, TF-IDF: 0.0
Word: the, TF-IDF: -0.04463
Top words in document 2
Word: second, TF-IDF: 0.23105
Word: this, TF-IDF: 0.0
Word: is, TF-IDF: 0.0
Word: document, TF-IDF: 0.0
Word: the, TF-IDF: -0.03719
Top words in document 3
Word: and, TF-IDF: 0.17329
Word: third, TF-IDF: 0.17329
Word: one, TF-IDF: 0.17329
Word: the, TF-IDF: -0.05579
Top words in document 4
Word: first, TF-IDF: 0.05754
Word: is, TF-IDF: 0.0
Word: this, TF-IDF: 0.0
Word: document, TF-IDF: 0.0
Word: the, TF-IDF: -0.04463
1.文本数据特征工程
文本的特征工程,原始数据将被转换为特征向量,另外也会根据现有的数据创建新的特征。为了从数据集中选出重要的特征,有以下几种方式:
- 计数向量作为特征
- TF-IDF向量作为特征
- 单个词语级别
- 多个词语级别(N-Gram)
- 词性级别
- 词嵌入作为特征
- 基于文本/NLP的特征
- 主题模型作为特征
本文介绍TF-IDF向量作为特征的实现
2.TF-IDF
TF-IDF(Term Frequency–Inverse Document Frequency,词频-逆文档频率)算法是当前非常常用的一种文本特征的提取方法,在文本信息检索、语义抽取等自然语言处理(NLP)中广泛应用。
TF(Term Frequency):中文意思是词频,也就是在一段文本中出现的频率较高的词。在之前的预处理中需要去掉英文中的停用词(类似于to、is、are、the这些高频出现但没有真正实际意义的词汇),所以这里我们往往可以认为出现频率越高的词汇会对整个文档有较大的影响。
IDF(Inverse Document Frequency):逆文档频率,首先我们回想一下停词,它们往往会在文档中非常高频的出现但是反而不能表达出文档的真实意思。那么同样的在不是停词的另外一些单词中,有些单词往往可以更加体现出文章的真实表达的意思,就像this thing made in china,and this thing is big。中thing只是个指代它既不能告诉你它是什么具体的东西也不能告诉你它的任何具体特征,但是big和china却可以很好的描述这句话说了什么,但是things的词频要比china和big都要大,这显然是有问题的。所以为了能够解决这么一个问题,我们需要对前面的TF进行修正,于是提出了逆文档频率,它的大小和一个词的常见程度是成反比的。
将TF和IDF相乘就会得到TF-IDF的算法:
TF(t)=(该词语在文档出现的次数)/(文档中词语的总数)
IDF(t)= log_e(文档总数/出现该词语的文档总数)
TF-IDF向量可以由不同级别的分词产生(单个词语,词性,多个词(n-grams))
- 词语级别TF-IDF:矩阵代表了每个词语在不同文档中的TF-IDF分数。
- N-gram级别TF-IDF: N-grams是多个词语在一起的组合,这个矩阵代表了N-grams的TF-IDF分数。
- 字符级别TF-IDF:矩阵代表了语料中字符级N-grams的TF-IDF分数(对应下方代码中 analyzer='char' 的用法)。
示例:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# NOTE(review): trainDF, train_x and valid_x are assumed to be defined by
# earlier (unshown) code — a DataFrame with a 'text' column and two text
# series for train/validation. Confirm against the original article.
# Word-level TF-IDF: each matrix column is one word's score per document.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)
# N-gram-level TF-IDF: scores for 2- and 3-word sequences.
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)
# Character-level TF-IDF (the original comment said "part-of-speech level",
# but analyzer='char' with ngram_range=(2,3) builds character n-grams).
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)
3.达观杯实战
# -*- coding: utf-8 -*-
"""
@brief : Vectorize the raw text data into TF-IDF features and save the result locally.
"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import time
t_start = time.time()
"""======================================================================================
1 数据预处理
"""
print("1 数据预处理")
# NOTE(review): hard-coded absolute Windows paths — consider parameterizing them.
df_train = pd.read_csv('D:/Race2018/DaGuan/new_data/train_set.csv')
df_test = pd.read_csv('D:/Race2018/DaGuan/new_data/test_set.csv')
# The raw 'article' column is not used below; only 'word_seg' (and labels) are kept.
df_train.drop(columns='article', inplace=True)
df_test.drop(columns='article', inplace=True)
# NOTE(review): f_all is assigned but never used afterwards in this script —
# dead code unless a later processing step (not shown) relies on it.
f_all = pd.concat(objs=[df_train, df_test], axis=0, sort=True)
# Labels in the CSV are 1-based; shift to 0-based for downstream classifiers.
y_train = (df_train['class'] - 1).values
"""======================================================================================
2 特征工程
"""
print("2 特征工程")
# Unigrams + bigrams; drop terms in fewer than 3 docs or more than 90% of docs.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, sublinear_tf=True)
vectorizer.fit(df_train['word_seg'])
x_train = vectorizer.transform(df_train['word_seg'])
x_test = vectorizer.transform(df_test['word_seg'])
"""======================================================================================
3 保存至本地
"""
print("3 保存至本地")
data = (x_train, y_train, x_test)
# Fix: use a context manager so the file is closed even if pickling raises.
with open('D:/Race2018/DaGuan/ml/feature/data_w_tfidf.pkl', 'wb') as fp:
    pickle.dump(data, fp)
t_end = time.time()
print("已将原始数据数字化为tfidf特征,共耗时:{}min".format((t_end-t_start)/60))
参考文献