"""
词表到向量的转换函数
"""
def loadDataSet():
postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
['stop', 'posting', 'stupid', 'worthless', 'garbage'],
['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
classVec = [0, 1, 0, 1, 0, 1]
return postingList, classVec
def createVocabList(dataSet):
vocabSet = set([])
for document in dataSet:
vocabSet = vocabSet | set(document)
return list(vocabSet)
def setOfWord2Vec(vocabList, inputSet):
returnVec = [0]*len(vocabList)
for word in inputSet:
if word in vocabList:
returnVec[vocabList.index(word)] = 1
else:
print("the word: %s is not in my Vocabulary!" % word)
return returnVec
import importlib
importlib.reload(pusu_bayes)
Out[14]: <module 'pusu_bayes' from 'C:\\Users\\xuning\\PycharmProjects\\machine learning\\bayes\\pusu_bayes.py'>
listOPosts, listClasses = pusu_bayes.loadDataSet()
myVocabList = pusu_bayes.createVocabList(listOPosts)
pusu_bayes.setOfWord2Vec(myVocabList, listOPosts[0])
the word: please is not in my Vocabulary!
Out[17]:
[1,
1,
0,
1,
0,
0,
0,
0,
1,
0,
1,
1,
0,
0,
0,
0,
0,
0,
1,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0]
myVocabList
Out[18]:
['has',
'my',
'to',
'dog',
'how',
'maybe',
'love',
'park',
'problems',
'stop',
'please',
'flea',
'take',
'is',
'mr',
'not',
'garbage',
'food',
'help',
'worthless',
'buying',
'I',
'so',
'him',
'posting',
'ate',
'licks',
'dalmation',
'steak',
'stupid',
'cute',
'quit']
pusu_bayes.setOfWord2Vec(myVocabList, listOPosts[3])
Out[20]:
[0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
0,
0,
0,
0,
0,
0,
1,
0,
0,
1,
0,
0,
0,
0,
1,
0,
0,
0,
0,
1,
0,
0]
def trainNB0(trainMatrix, trainCategory):
numTrainDocs = len(trainMatrix)
numWords = len(trainMatrix[0])
pAbusive = sum(trainCategory)/float(numTrainDocs)
p0Num = zeros(numWords); p1Num = zeros(numWords)
p0Denom = 0.0; p1Denom = 0.0
for i in range(numTrainDocs):
if trainCategory[i] == 1:
p1Num += trainMatrix[i]
p1Denom += sum(trainMatrix[i])
else:
p0Num += trainMatrix[i]
p0Denom += sum(trainMatrix[i])
print(p1Num, p0Num, p1Denom, p0Denom)
p1Vect = p1Num/p1Denom
p0Vect = p0Num/p0Denom
return p0Vect, p1Vect, pAbusive
importlib.reload(pusu_bayes)
Out[36]: <module 'pusu_bayes' from 'C:\\Users\\xuning\\PycharmProjects\\machine learning\\bayes\\pusu_bayes.py'>
listOPosts, listClasses = pusu_bayes.loadDataSet()
myVocabList = pusu_bayes.createVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
...: trainMat.append(pusu_bayes.setOfWord2Vec(myVocabList, postinDoc))
...:
p0V, p1V, pAb = pusu_bayes.trainNB0(trainMat, listClasses)
[0. 0. 1. 2. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 2. 1. 0. 0. 1.
1. 0. 0. 0. 0. 3. 0. 1.] [1. 3. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 2.
0. 1. 1. 1. 1. 0. 1. 0.] 19.0 24.0
pAb
Out[42]: 0.5
p0V
Out[43]:
array([0.04166667, 0.125 , 0.04166667, 0.04166667, 0.04166667,
0. , 0.04166667, 0. , 0.04166667, 0.04166667,
0.04166667, 0.04166667, 0. , 0.04166667, 0.04166667,
0. , 0. , 0. , 0.04166667, 0. ,
0. , 0.04166667, 0.04166667, 0.08333333, 0. ,
0.04166667, 0.04166667, 0.04166667, 0.04166667, 0. ,
0.04166667, 0. ])
p1V
Out[44]:
array([0. , 0. , 0.05263158, 0.10526316, 0. ,
0.05263158, 0. , 0.05263158, 0. , 0.05263158,
0. , 0. , 0.05263158, 0. , 0. ,
0.05263158, 0.05263158, 0.05263158, 0. , 0.10526316,
0.05263158, 0. , 0. , 0.05263158, 0.05263158,
0. , 0. , 0. , 0. , 0.15789474,
0. , 0.05263158])
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
p1 = sum(vec2Classify * p1Vec) + log(pClass1)
p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
if p1 > p0:
return 1
else:
return 0
def testingNB():
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
trainMat.append(setOfWord2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
testEntry1 = ['love', 'my', 'dalmation']
thisDoc1 = array(setOfWord2Vec(myVocabList, testEntry1))
print(testEntry1, 'classified as:', classifyNB(thisDoc1, p0V, p1V, pAb))
testEntry2 = ['stupid', 'garbage']
thisDoc2 = array(setOfWord2Vec(myVocabList, testEntry2))
print(testEntry2, 'classified as:', classifyNB(thisDoc2, p0V, p1V, pAb))
importlib.reload(pusu_bayes)
Out[63]: <module 'pusu_bayes' from 'C:\\Users\\xuning\\PycharmProjects\\machine learning\\bayes\\pusu_bayes.py'>
pusu_bayes.testingNB()
['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1
"""
文件解析及完整的垃圾邮件测试函数
"""
def textParse(bigString):
import re
listOfTokens = re.split(r'\w+', bigString)
return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
docList = []
classList = []
fullText = []
for i in range(1, 26):
wordList = textParse(open('email/spam/%d.txt' % i).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(1)
wordList = textParse(open('email/ham/%d.txt' % i).read())
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
vocabList = createVocabList(docList)
trainingSet = list(range(50))
testSet = []
for i in range(10):
rangeIndex = int(random.uniform(0, len(trainingSet)))
testSet.append(trainingSet[rangeIndex])
del(trainingSet[rangeIndex])
trainMat = []
trainClasses = []
for docIndex in trainingSet:
trainMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
trainClasses.append(classList[docIndex])
p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
errorCount = 0
for docIndex in testSet:
wordVector = setOfWord2Vec(vocabList, docList[docIndex])
if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
errorCount += 1
print('the error rate is:', float(errorCount/len(testSet)))
importlib.reload(pusu_bayes)
Out[89]: <module 'pusu_bayes' from 'C:\\Users\\xuning\\PycharmProjects\\machine learning\\bayes\\pusu_bayes.py'>
pusu_bayes.spamTest()
the error rate is: 0.1