from numpy import *

# Create some experimental samples. The first value returned is the collection
# of documents after token splitting; the second value returned is a list of
# class labels, one per document.
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec
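
# Quick smoke test (illustrative, not part of the original listing): confirm
# that loadDataSet() hands back six tokenized posts and their six labels.
posts, classes = loadDataSet()
print(len(posts), 'posts, labels:', classes)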

# Build a list of the unique words that appear across all documents.
def createVocabList(dataSet):
    vocabSet = set([])  # create an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)
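
# Illustrative check (an addition, not in the original listing): the vocabulary
# holds one entry per distinct token, so 'dog', which appears in several posts,
# should show up exactly once.
vocab = createVocabList(loadDataSet()[0])
print('vocabulary size:', len(vocab), '| dog appears once:', vocab.count('dog') == 1)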

# This function takes the vocabulary list and a document, and outputs a document
# vector whose elements are each 1 or 0, indicating whether the corresponding
# vocabulary word appears in the input document.
# The function first creates a vector the same length as the vocabulary, with
# every element set to 0. It then loops over all the words in the document and,
# whenever a vocabulary word occurs, sets the matching value in the output
# vector to 1. If everything goes well, there should be no need to check whether
# a word is still in vocabList, but that check may come in handy later.
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)  # a vector of zeros, one slot per vocabulary word
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
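
# Worked example (an addition, not in the original listing): converting a post
# into a set-of-words vector. Every word of the toy posts is in the vocabulary,
# so the number of ones must equal the number of distinct words in the post.
samplePosts, _ = loadDataSet()
sampleVocab = createVocabList(samplePosts)
sampleVec = setOfWords2Vec(sampleVocab, samplePosts[0])
print('ones in vector:', sum(sampleVec), '| distinct words in post:', len(set(samplePosts[0])))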

'''
Pseudocode for the function below:
    Count the number of documents in each class
    For every training document:
        For each class:
            If a token appears in the document, increment the count for that token
            Increment the count of total tokens
    For each class:
        For each token:
            Divide the token count by the total token count to get the conditional probability
    Return the conditional probabilities for each class
'''

# Naive Bayes classifier training function.
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Initialize the probabilities.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Counts start at one and denominators at 2.0 (Laplace smoothing) so that a
    # word unseen in one class cannot force a zero probability.
    p0Num = ones(numWords); p1Num = ones(numWords)  # change to ones()
    p0Denom = 2.0; p1Denom = 2.0  # change to 2.0
    for i in range(numTrainDocs):
        # Vector addition: accumulate per-word counts for document i's class.
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Element-wise division of each count by the class total, then a log
    # transform so later products of probabilities become sums and cannot underflow.
    p1Vect = log(p1Num / p1Denom)  # change to log()
    p0Vect = log(p0Num / p0Denom)  # change to log()
    return p0Vect, p1Vect, pAbusive
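
# Training sketch (an addition, using the toy data above): three of the six
# posts are labeled abusive, so the prior pAb should come out to 0.5.
trainPosts, trainLabels = loadDataSet()
demoVocab = createVocabList(trainPosts)
demoMat = [setOfWords2Vec(demoVocab, post) for post in trainPosts]
p0Demo, p1Demo, pAbDemo = trainNB0(array(demoMat), array(trainLabels))
print('prior p(abusive):', pAbDemo)  # expected: 0.5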

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # Element-wise multiplication; in log space the product of word probabilities
    # becomes a sum, so each class score is sum(log p(w|c)) plus log p(c).
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
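
# Why the log transform matters (an addition, not in the original listing):
# multiplying many small probabilities underflows to 0.0 in floating point,
# while summing their logs stays finite and comparable.
probs = array([0.01] * 200)
print('raw product:', prod(probs))        # 0.0 -- underflow
print('log-space sum:', sum(log(probs)))  # about -921.0, perfectly usable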

# If a word appears more than once in a document, that may convey information
# that the mere presence or absence of the word cannot express; this approach
# is known as the bag-of-words model.
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
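
# Contrast with setOfWords2Vec (an addition, not in the original listing): on a
# post containing 'stupid' twice, the bag-of-words vector counts 2 where the
# set-of-words vector caps at 1.
bagVocab = createVocabList(loadDataSet()[0])
repeatPost = ['stupid', 'garbage', 'stupid']
print('set-of-words max:', max(setOfWords2Vec(bagVocab, repeatPost)))   # 1
print('bag-of-words max:', max(bagOfWords2VecMN(bagVocab, repeatPost)))  # 2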

def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
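
# Exercising the full pipeline (an addition): the first test entry should be
# classified as 0 (not abusive) and the second as 1 (abusive).
testingNB()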

# A quick look at string tokenization: split() on whitespace leaves punctuation
# attached to the words (e.g. 'M.L.' and 'upon.').
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
A = mySent.split()
print(A)
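
# A minimal tokenization sketch (an addition, standard library only): splitting
# on runs of non-word characters strips the punctuation that split() leaves
# behind, and lowercasing plus a length filter drops fragments like 'M' and 'L'.
import re
tokens = [tok.lower() for tok in re.split(r'\W+', mySent) if len(tok) > 2]
print(tokens)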