06_TF-IDF算法代码示例
TF-IDF算法代码示例
0.引入依赖
import numpy as np # 数值计算、矩阵运算、向量运算
import pandas as pd # 数值分析、科学计算
1.定义数据和预处理
# Two toy documents that together form the whole corpus.
docA = 'The cat sat on my bed'
docB = 'The dog sat on my knees'
# Tokenize each document into a bag of words by splitting on single spaces.
bowA, bowB = (doc.split(' ') for doc in (docA, docB))
# bowA -> ['The', 'cat', 'sat', 'on', 'my', 'bed']
# bowB -> ['The', 'dog', 'sat', 'on', 'my', 'knees']
# Build the vocabulary: every distinct word that occurs in either document.
wordSet = set(bowA) | set(bowB)
# wordSet -> {'The', 'bed', 'cat', 'dog', 'knees', 'my', 'on', 'sat'}
2.进行词数统计
# One counter dict per document: every vocabulary word is a key, starting at 0.
wordDictA = {word: 0 for word in wordSet}
wordDictB = {word: 0 for word in wordSet}
# Bare expressions with no effect when run as a script; presumably kept from
# interactive use, where evaluating a name echoes its value.
wordDictA
wordDictB
# Walk through document A's tokens and tally how often each word occurs.
# Every token is already a key of wordDictA (the dict was built from the
# vocabulary that includes bowA), so a plain += never raises KeyError.
for word in bowA:
    # The loop body must be indented; without it Python rejects the file
    # with "IndentationError: expected an indented block".
    wordDictA[word] += 1