import numpy as np
import pandas as pd
# Two short "documents" used to demonstrate TF-IDF by hand.
str01 = "the hello my union left spark flink"
str02 = "hive hadoop spark my keep my hbase the is datatabase table partition"
str_list01 = str01.split(" ")
str_list02 = str02.split(" ")
# Build the vocabulary: every distinct word seen in either document.
wordset = set(str_list01).union(set(str_list02))
# Per-document count dicts initialised to 0 for every vocabulary word, so both
# dicts share the same key set (needed for the DataFrame view and IDF below).
wordDict01 = dict.fromkeys(wordset, 0)
wordDict02 = dict.fromkeys(wordset, 0)
# Count word occurrences per document.
# FIX: loop variable renamed from `str`, which shadowed the builtin type.
for word in str_list01:
    wordDict01[word] += 1
for word in str_list02:
    wordDict02[word] += 1
print(wordDict01)
print(wordDict02)
print(pd.DataFrame([wordDict01, wordDict02]))
# Compute term frequency (TF) for one document.
def count_TF(worddict, bow):
    """Return {word: tf} for every vocabulary word.

    tf = raw count of the word (from `worddict`) divided by the total
    number of tokens in the document `bow`.
    """
    total_tokens = len(bow)
    return {word: cnt / total_tokens for word, cnt in worddict.items()}
# Print the term-frequency table of each document.
for counts, tokens in ((wordDict01, str_list01), (wordDict02, str_list02)):
    print(count_TF(counts, tokens))
# Compute inverse document frequency (IDF) with add-one smoothing.
def count_IDF(worddict_list):
    """Return {word: idf} given one count-dict per document.

    All dicts are assumed to share the same key set (the vocabulary).
    idf = log10((N + 1) / (df + 1)), where N is the number of documents
    and df is the number of documents in which the word appears.
    """
    import math  # local import, mirroring the original script's placement
    doc_count = len(worddict_list)
    # Document frequency: in how many documents does each word occur?
    idf_dict = dict.fromkeys(worddict_list[0], 0)
    for counts in worddict_list:
        for word, cnt in counts.items():
            if cnt > 0:
                idf_dict[word] += 1
    # Replace each document frequency by its smoothed, log-scaled IDF.
    return {word: math.log10((doc_count + 1) / (df + 1))
            for word, df in idf_dict.items()}
# Show the IDF of every vocabulary word across the two documents.
print(count_IDF([wordDict01, wordDict02]))
# Combine TF and IDF into the final TF-IDF weight per word.
def count_TFIDF(tf, idfs):
    """Return {word: tf * idf} for every word in the `tf` dict.

    `idfs` must contain an entry for every key of `tf`.
    """
    return {word: freq * idfs[word] for word, freq in tf.items()}
# Print the TF-IDF table of each document.
# BUG FIX: the second call previously passed str_list01 as document 2's token
# list, so doc 2's term frequencies were divided by the wrong document length;
# it must use str_list02.
print(count_TFIDF(count_TF(wordDict01, str_list01), count_IDF([wordDict01, wordDict02])))
print(count_TFIDF(count_TF(wordDict02, str_list02), count_IDF([wordDict01, wordDict02])))
# 推荐系统 -- TF-IDF Python 代码实现 (Recommender systems: TF-IDF implemented in Python)
# 最新推荐文章于 2023-05-07 15:59:19 发布 (blog footer: latest article published 2023-05-07 15:59:19)