# 之前学过 TF-IDF 算法,但没有自己实现过。最近正好拿到一个数据集,就自己写了一个。
# 因为每个人的需求不同,代码也会有一些差异,以下仅供参考;后面还会公布一些其他自己实现的算法。
# -*- coding: utf-8 -*-
import math
import operator
from collections import Counter, defaultdict

import pandas as pd
from nltk.tokenize import word_tokenize
def loadDataSet(data):
    """Tokenize tweets and drop stopwords and punctuation tokens.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Tweet' column (text) and a 'label' column.

    Returns
    -------
    tuple[list[list[str]], list]
        (cutdata, classVec): one token list per tweet, and the class
        label vector in the same order.

    Notes
    -----
    Reads the module-level global ``stopword`` (set up in ``__main__``).
    BUG FIX: the original removed stopwords from a list *while iterating
    it* (``for j in dataset[i]: dataset[i].remove(j)``), which skips the
    element following each removal, so e.g. two consecutive stopwords
    left the second one in place. Filtering into a new list fixes that.
    """
    sentences = list(data['Tweet'])
    classVec = list(data['label'])  # class label vector, parallel to sentences

    # Punctuation tokens to discard (set for O(1) membership).
    punctuation = {',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*'}
    stop = set(stopword)  # global stopword list -> set for fast lookup

    # One pass per tweet: tokenize, then keep only non-stopword,
    # non-punctuation tokens.
    cutdata = [
        [tok for tok in word_tokenize(text)
         if tok not in stop and tok not in punctuation]
        for text in sentences
    ]
    return cutdata, classVec
def feature_select(list_words):
    """Return the highest TF-IDF (word, score) pair for each document.

    Parameters
    ----------
    list_words : list[list[str]]
        One token list per document.

    Returns
    -------
    list[tuple[str, float]]
        For each document, the word with the maximal TF-IDF score in that
        document and its score. Ties are broken by first occurrence in the
        document, matching the original dict-iteration behavior.

    Notes
    -----
    TF is count/len(doc); IDF is ``log(N / (df + 1))`` where ``df`` is the
    number of documents containing the word (the original's smoothing is
    kept so results are identical). The original scanned every document
    with list membership for every vocabulary word (O(vocab x docs x
    doc_len)); a set-based pass over each document is O(total tokens).
    """
    doc_num = len(list_words)

    # Term frequency per document. Counter preserves first-occurrence
    # order, which preserves the original tie-breaking in max() below.
    tf_per_doc = []
    for words in list_words:
        counts = Counter(words)
        total = sum(counts.values())
        tf_per_doc.append({w: c / total for w, c in counts.items()})

    # Document frequency: number of documents containing each word.
    doc_freq = Counter()
    for words in list_words:
        doc_freq.update(set(words))

    # IDF with the original +1 smoothing in the denominator.
    word_idf = {w: math.log(doc_num / (df + 1)) for w, df in doc_freq.items()}

    # Pick the top-scoring word per document (first occurrence wins ties).
    final = []
    for tf in tf_per_doc:
        final.append(max(((w, tf_w * word_idf[w]) for w, tf_w in tf.items()),
                         key=lambda pair: pair[1]))
    return final
if __name__ == '__main__':
    # English stopword list: one word per line, no header row.
    word = pd.read_csv('stop.txt', header=None)
    stopword = list(word[0])  # module-level global read by loadDataSet
    # NOTE(review): hard-coded absolute Windows path — adjust for your machine.
    data = pd.read_csv(r'D:\chengxu\Anconda3\Stance\input_data\train_data.csv')
    data_list, label_list = loadDataSet(data)  # tokenized/cleaned tweets + labels
    features = feature_select(data_list)  # top TF-IDF (word, score) per tweet
    print(features)
    print(len(features))