推荐系统——TF-IDF 的 Python 代码实现

import  numpy as np
import pandas as pd

# Two sample documents that make up the toy corpus.
str01 = "the hello my union left spark flink"
str02 = "hive hadoop spark my keep my hbase the is datatabase table partition"
# Tokenise each document on single spaces.
str_list01 = str01.split(" ")
str_list02 = str02.split(" ")
# Build the vocabulary: every distinct word that appears in either document.
wordset = set(str_list01).union(set(str_list02))
# One count dictionary per document, seeded with 0 for every vocabulary word
# so both dictionaries share exactly the same keys.
wordDict01 = dict.fromkeys(wordset, 0)
wordDict02 = dict.fromkeys(wordset, 0)
# Walk each document and count word occurrences.
# The loop variable is deliberately not called `str`: a module-level loop
# variable outlives the loop and would shadow the builtin `str` afterwards.
for word in str_list01:
    wordDict01[word] += 1
for word in str_list02:
    wordDict02[word] += 1
print(wordDict01)
print(wordDict02)
# Tabular view: one row per document, one column per vocabulary word.
print(pd.DataFrame([wordDict01, wordDict02]))

# Compute term frequency (TF)
def count_TF(worddict, bow):
    """Return the term frequency of every word in ``worddict``.

    Args:
        worddict: Mapping of word -> number of occurrences in one document.
        bow: The document's token list ("bag of words"). Only its length is
            used, as the normalising denominator.

    Returns:
        A new dict mapping each word of ``worddict`` to ``count / len(bow)``.
        If ``bow`` is empty, every word maps to ``0.0`` rather than raising
        ``ZeroDivisionError``.
    """
    number_bow = len(bow)
    if number_bow == 0:
        # An empty document has no tokens to normalise by; report zero
        # frequency for every word instead of dividing by zero.
        return dict.fromkeys(worddict, 0.0)
    return {word: count / number_bow for word, count in worddict.items()}
# Show the term-frequency table of each sample document in turn.
for counts, tokens in ((wordDict01, str_list01), (wordDict02, str_list02)):
    print(count_TF(counts, tokens))

# Compute inverse document frequency (IDF)
def count_IDF(worddict_list):
    """Return the smoothed inverse document frequency of every word.

    Args:
        worddict_list: One word -> count dict per document. The dicts do not
            need to share the same keys; the result covers the union of all
            keys seen.

    Returns:
        A dict mapping each word to ``log10((N + 1) / (df + 1))``, where ``N``
        is the number of documents and ``df`` is the number of documents in
        which the word's count is greater than zero. An empty
        ``worddict_list`` yields an empty dict.
    """
    import math  # local import: only log10 is needed, and only here

    number_count = len(worddict_list)
    # Document frequency per word. Words are registered even when their count
    # is 0 so that they still receive an IDF value (df == 0).
    doc_freq = {}
    for worddict in worddict_list:
        for word, count in worddict.items():
            doc_freq[word] = doc_freq.get(word, 0) + (1 if count > 0 else 0)
    # The +1 in numerator and denominator smooths the ratio and avoids a
    # division by zero for words that occur in no document.
    return {
        word: math.log10((number_count + 1) / (df + 1))
        for word, df in doc_freq.items()
    }

# Show the IDF score of every vocabulary word across the two sample documents.
idf_scores = count_IDF([wordDict01, wordDict02])
print(idf_scores)

# Compute TF-IDF
def count_TFIDF(tf, idfs):
    """Combine term frequencies with IDF weights.

    Every word in ``tf`` is looked up in ``idfs``; the returned dict maps the
    word to the product of its term frequency and its IDF value.
    """
    return {term: freq * idfs[term] for term, freq in tf.items()}
# The IDF table depends only on the corpus, so compute it once and reuse it
# for both documents.
idf_table = count_IDF([wordDict01, wordDict02])
print(count_TFIDF(count_TF(wordDict01, str_list01), idf_table))
# Document 2's counts must be normalised by document 2's own token list
# (str_list02); using str_list01 would divide by the wrong document length.
print(count_TFIDF(count_TF(wordDict02, str_list02), idf_table))
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值