def setOfWord2Vec(vocablist, inputSet):
    """Convert a document (list of words) into a set-of-words vector.

    Returns a list the same length as ``vocablist`` with 1 at the position
    of every vocabulary word that appears in ``inputSet`` and 0 elsewhere,
    so documents become fixed-length numeric vectors for the classifier.

    Words not in the vocabulary are reported and skipped.
    """
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        if word in vocablist:
            # list.index() returns the position of the FIRST match,
            # similar to str.find for strings.
            returnVec[vocablist.index(word)] = 1
        else:
            print("the word: {0} is not in vocablist".format(word))
    return returnVec
# Example classifier output (notebook cell result, kept for reference):
# ['love', 'my', 'dalmation'] classified as : 0
# ['stupid', 'garbage'] classified as : 1
def textParse(bigString):
    """Tokenize raw text: split on non-word characters, lowercase,
    and keep only tokens longer than 2 characters.

    Returns an empty list for ``None`` input (previously returned None
    implicitly; both are falsy, so callers are unaffected).
    """
    import re
    if bigString is None:
        return []
    # \W+ instead of \W*: re.split needs a pattern that cannot match
    # the empty string (zero-width split is an error in modern Python).
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Train and evaluate a naive Bayes spam classifier on the email corpus.

    Loads 26 spam and 26 ham emails (a 0.txt was added to each directory so
    the files line up with range(26)), holds out 10 random documents as a
    test set, trains on the remaining 40, and prints the test error rate.

    Relies on module-level helpers: textParse, createVocabList,
    setOfWord2Vec, trainNB0, classifyNB, and numpy as np.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(26):
        # read() loads the whole file as a single string.
        # NOTE(review): paths are Windows-style ('\\'); the corpus is
        # assumed to sit next to this script — confirm on other platforms.
        with open('email\\spam\\%d.txt' % i, 'r') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # 1 = spam
        # ham\6.txt originally contained a mojibake 'are' that broke decoding.
        with open('email\\ham\\%d.txt' % i, 'r') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # 0 = ham
    vocabList = createVocabList(docList)
    # list() is required: in Python 3 range() is a lazy object and
    # del by index below needs a real list.
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        # Pick a random document index for the hold-out test set.
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainingMat = []
    trainingClasses = []
    for docIndex in trainingSet:
        trainingMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
        trainingClasses.append(classList[docIndex])
    p0V, p1V, pAb = trainNB0(np.array(trainingMat), np.array(trainingClasses))
    errorCount = 0.0
    for docIndex in testSet:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
    print('the error rate is : ', float(errorCount) / len(testSet))
# Guard the script entry point so importing this module does not
# immediately run the experiment (which reads files and prints).
if __name__ == "__main__":
    spamTest()
# Example run output (notebook cell result, kept for reference):
# the error rate is :  0.1
# Scratch REPL experiments probing GBK encoding behaviour (notebook cells).
# These are no-op expressions at module level; their values are discarded.
'ab'.encode('gbk')
# '\0' is a NUL character followed by the literal text 'xab' —
# presumably an attempt to write the byte escape '\xab'; TODO confirm intent.
'\0xab'.encode('gbk')
# Captured notebook output of the line above: NUL byte + b'xab'.
b'\x00xab'
# Before the fix, chardet reported ham\6.txt with confidence 0.73 as a
# Windows codepage (the comment said "windows-1225" — presumably a typo
# for windows-1252); mojibake in the middle of the file caused the error.
import chardet

# Open in binary mode so chardet sees the raw bytes; print the detection
# result (a bare expression's value is discarded outside a notebook).
with open('email\\ham\\6.txt', 'rb') as f:
    print(chardet.detect(f.read()))