gensim 返回的 tfidf 是稀疏格式:每篇文档对应一个由 (词id, 权重) 元组组成的列表,如下所示:
[[(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)],
[(0, 0.10212329019650272), (2, 0.10212329019650272), (4, 0.10212329019650272), (5, 0.9842319344536239)],
[(6, 0.5773502691896258), (7, 0.5773502691896258), (8, 0.5773502691896258)], [(0, 0.33699829595119235), (1, 0.8119707171924228), (2, 0.33699829595119235), (4, 0.33699829595119235)]]
有时候需要把 tfidf 作为每个词的权重,这时就要把上面的稀疏格式转换成与原文词序一一对应的权重列表。用 sklearn 虽然可以直接给出结果,但我不想用它,于是有了如下的函数,做个记录;没有考虑算法效率。
'''
输入:
wordList=[ ['this', 'is', 'the', 'first', 'document'],
['this', 'is', 'the', 'second', 'second', 'document'],
['and', 'the', 'third', 'one'],
['is', 'this', 'the', 'first', 'document']]
输出:
[[0.33699829595119235, 0.33699829595119235, 0.0, 0.8119707171924228, 0.33699829595119235],
[0.10212329019650272, 0.10212329019650272, 0.0, 0.9842319344536239, 0.9842319344536239, 0.10212329019650272],
[0.5773502691896258, 0.0, 0.5773502691896258, 0.5773502691896258], [0.33699829595119235, 0.33699829595119235, 0.0, 0.8119707171924228, 0.33699829595119235]]
'''
def TFIDF_change(wordList):
    """Return TF-IDF weights aligned token-by-token with each input document.

    gensim represents a document's TF-IDF vector sparsely, as a list of
    ``(token_id, weight)`` pairs. This function expands that form so that
    every token occurrence in the original document gets its own weight,
    in the original token order.

    Args:
        wordList: A list of tokenized documents, each a list of string
            tokens. It is iterated more than once, so it must be a
            re-iterable sequence rather than a one-shot generator.

    Returns:
        A list with one inner list per document. Inner list ``i`` has the
        same length as ``wordList[i]``; element ``k`` is the TF-IDF weight
        of token ``wordList[i][k]`` in document ``i``, or ``0.0`` when
        gensim's TF-IDF vector for that document has no entry for the
        token (in the example in this file, the word ``'the'``, which
        occurs in every document). Repeated tokens repeat their weight.

    Note:
        The dictionary and the TF-IDF model are fitted on ``wordList``
        itself; no frequency-based filtering of tokens is applied.
    """
    dictionary = corpora.Dictionary(wordList)
    token2id = dictionary.token2id

    # Build each bag-of-words once and reuse it both for fitting the model
    # and for transforming the document.
    corpus = [dictionary.doc2bow(text) for text in wordList]
    tfidf = models.TfidfModel(corpus)

    words_vec = []
    for text, bow in zip(wordList, corpus):
        # Sparse (token_id, weight) pairs -> dict, so each token is a
        # constant-time lookup instead of a scan over the whole vector.
        weight_by_id = dict(tfidf[bow])
        words_vec.append(
            [weight_by_id.get(token2id[token], 0.0) for token in text]
        )
    return words_vec