- 引入依赖
import numpy as np
import pandas as pd
- 定义数据和预处理
# Two sample sentences that make up the toy corpus.
docA = "The cat sat on my bed"
docB = "The dog sat on my knees"

# Tokenize each sentence on single spaces to get its bag of words.
bowA, bowB = (sentence.split(" ") for sentence in (docA, docB))
bowA
# Build the vocabulary: every distinct word that occurs in either document.
wordSet = set(bowA) | set(bowB)
wordSet
Out[3]:
{'The', 'bed', 'cat', 'dog', 'knees', 'my', 'on', 'sat'}
- 进行词数统计
# Count table for document A: one entry per vocabulary word, starting at zero.
wordDictA = {word: 0 for word in wordSet}
wordDictB