# 先把代码贴上,注释后期更新
# 书上例4.7的网址已经失效,建议换新的网址
from numpy import *
#创建数据集
def loadDataSet():
    """Build the small hand-labelled sample corpus used by this module.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        tokenised posts (each a list of word strings) and ``classVec`` is a
        list with one 0/1 class label per post, in the same order.

    New lists are created on every call, so callers may freely mutate the
    returned objects.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # NOTE(review): presumably 1 marks an abusive post and 0 a normal one;
    # the label meaning is not stated in this file - confirm.
    classVec = [0, 1, 0, 1, 0, 1]
    # Return the word lists together with their class labels.
    return postingList, classVec
def createVocabList(dataSet):
    """Collect every distinct word that appears in any document.

    Args:
        dataSet: An iterable of documents, each itself an iterable of
            hashable tokens (e.g. a list of word strings).

    Returns:
        A list containing each distinct token exactly once. On Python 3.7+
        the tokens are in order of first appearance, so the same input always
        produces the same vocabulary (a plain ``list(set(...))`` would vary
        between interpreter runs because of string-hash randomisation).
    """
    # dict keys de-duplicate like a set but keep insertion order, and this
    # avoids rebuilding a new set for every document.
    return list(dict.fromkeys(
        word for document in dataSet for word in document
    ))
#该函数输入参数为词汇表及某篇文章,输出为文档向量,向量每个元素为1或0,分别表示词汇表中的单词在输入文档中是否出现;
def setOfWords2Vec(vocabList,inputSet):
returnVec=[0]*len(vocabList)#构建一个0向量;
for word in inputSet:
if word in vocabList:# 遍历词汇表,如果文档中出现了词汇表中单词,则将输出的文档向量中对应值设为1,旨在计算各词汇出现的次数;
returnVec[vocabList.index(word)]=1#因为上一段代码里,给的文章例子里的单词都是不重复的,如果有重