词频-逆文档频率(TF-IDF)算法

最新推荐文章于 2024-05-10 08:30:00 发布

鸡康

最新推荐文章于 2024-05-10 08:30:00 发布

阅读量715

点赞数

分类专栏： py机器学习　文章标签： python

本文链接：https://blog.csdn.net/qq_45451226/article/details/115508144

版权

py机器学习专栏收录该内容

6 篇文章 0 订阅

订阅专栏

# -*- coding=utf-8 -*-

import numpy as np
import pandas as pd

# Two toy documents that make up the whole corpus.
docA = "The cat sat on my bed"
docB = "The dog sat on my knees"

# Tokenise on single spaces, e.g. bowA == ['The', 'cat', 'sat', 'on', 'my', 'bed'].
bowA, bowB = docA.split(" "), docB.split(" ")

# Vocabulary: every distinct token seen in either document.
wordSet = set(bowA) | set(bowB)

# One counter per document, pre-filled with 0 for every vocabulary word so
# that both dictionaries share exactly the same keys.
wordDictA = {term: 0 for term in wordSet}
wordDictB = {term: 0 for term in wordSet}

# Walk each document once and bump the count of every token encountered.
for counts, tokens in ((wordDictA, bowA), (wordDictB, bowB)):
    for token in tokens:
        counts[token] += 1

# Tabulate the raw counts, one row per document. The column order follows the
# iteration order of the set, so it can differ between runs. Example:
#      on  cat  bed  sat  The  dog  knees  my
#   0   1    1    1    1    1    0      0   1
#   1   1    0    0    1    1    1      1   1
pd.DataFrame([wordDictA, wordDictB])

def computeTF(wordDict, bow):
    """Return the term frequency of every vocabulary word in one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bow: The document's list of tokens; its length is the normaliser.

    Returns:
        A new dict mapping each key of ``wordDict`` to ``count / len(bow)``.
        When ``bow`` is empty every frequency is ``0.0`` (an empty document
        contains no terms) instead of raising ``ZeroDivisionError``.
    """
    nbowCount = len(bow)
    if nbowCount == 0:
        # Guard the division below: nothing can occur in an empty document.
        return dict.fromkeys(wordDict, 0.0)

    return {word: count / nbowCount for word, count in wordDict.items()}


# Term frequencies of the two documents: each word count divided by the
# number of tokens in that document.
tfA = computeTF(wordDictA, bowA)  # bowA is ['The', 'cat', 'sat', 'on', 'my', 'bed']
tfB = computeTF(wordDictB, bowB)
# Example values of tfA and tfB respectively (key order may vary between runs):
#{'my': 0.16666666666666666, 'knees': 0.0, 'dog': 0.0, 'The': 0.16666666666666666, 'cat': 0.16666666666666666, 'on': 0.16666666666666666, 'bed': 0.16666666666666666, 'sat': 0.16666666666666666}
#{'knees': 0.16666666666666666, 'sat': 0.16666666666666666, 'The': 0.16666666666666666, 'bed': 0.0, 'my': 0.16666666666666666, 'on': 0.16666666666666666, 'dog': 0.16666666666666666, 'cat': 0.0}

def computeIDF(wordDictList):
    """Return the smoothed inverse document frequency of every word.

    For each word, ``Ni`` is the number of documents whose count for that
    word is greater than zero, and the result is
    ``log10((N + 1) / (Ni + 1))`` where ``N`` is the number of documents.

    Args:
        wordDictList: List of per-document dicts mapping word -> count.

    Returns:
        A dict mapping every word that appears as a key in any of the input
        dicts to its idf value. An empty ``wordDictList`` yields ``{}``.
    """
    # Local import: the module does not import math at file level.
    import math

    N = len(wordDictList)

    # Document frequency (Ni) per word. The vocabulary is collected from all
    # documents rather than only the first one, so dictionaries with
    # differing key sets no longer raise KeyError.
    docFreq = {}
    for wordDict in wordDictList:
        for word, count in wordDict.items():
            docFreq.setdefault(word, 0)
            if count > 0:
                docFreq[word] += 1

    # Turn each Ni into its idf value; the +1 terms keep the ratio finite
    # even for a word that occurs in no document.
    return {word: math.log10((N + 1) / (ni + 1)) for word, ni in docFreq.items()}


# Inverse document frequency of every vocabulary word over both documents.
idfs = computeIDF([wordDictA, wordDictB])
# Example result (key order may vary); words present in both documents get 0.0:
#{'knees': 0.17609125905568124, 'cat': 0.17609125905568124, 'on': 0.0, 'bed': 0.17609125905568124, 'The': 0.0, 'dog': 0.17609125905568124, 'my': 0.0, 'sat': 0.0}

def computeTFIDF(tf, idfs):
    """Combine term frequencies with idf values into TF-IDF weights.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every key of ``tf``.

    Returns:
        A new dict mapping each word of ``tf`` to ``tf[word] * idfs[word]``.
    """
    return {term: weight * idfs[term] for term, weight in tf.items()}

# TF-IDF weight of every vocabulary word in each document (tf * idf).
tfidfA = computeTFIDF( tfA, idfs )
tfidfB = computeTFIDF( tfB, idfs )

# Tabulate the weights, one row per document. The result is neither assigned
# nor printed here. Example (column order may vary between runs):
pd.DataFrame([tfidfA, tfidfB])#         cat       bed  sat  The   on   my     knees       dog
#                               0  0.029349  0.029349  0.0  0.0  0.0  0.0  0.000000  0.000000
#                               1  0.000000  0.000000  0.0  0.0  0.0  0.0  0.029349  0.029349

鸡康

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
词频-逆文档频率(TF-IDF)算法

# -*- coding=utf-8 -*-import numpy as npimport pandas as pddocA = "The cat sat on my bed"docB = "The dog sat on my knees"bowA = docA.split(" ") #['The', 'cat', 'sat', 'on', 'my', 'bed']bowB = docB.split(" ")# print(bowA)# 构建词库wordSet = set(bow
复制链接

扫一扫