python 基础:
中间还会遇到 pickle 以二进制方式读取文件时报错的问题,可以参见:《机器学习实战》初识决策树(ID3)算法、理解其 python 代码(二)的第四部分。
append: Appends object at end (the argument is added as ONE element, even if it is a list):
x = [1, 2, 3]
x.append([4, 5])
print (x)
[1, 2, 3, [4, 5]]
extend: Extends list by appending elements from the iterable (each element is added individually):
x = [1, 2, 3]
x.extend([4, 5])
print (x)
[1, 2, 3, 4, 5]
测试算法:
import random
import re
from numpy import array
import LoadData
import bayes
def textParse(bigString):
    """Split a large string into a list of lowercase tokens.

    The text is split on runs of non-word characters (regex ``\\W``), empty
    tokens are discarded and every remaining token is lower-cased. Short
    tokens are kept; only empty strings are filtered out.

    Args:
        bigString: The raw text to tokenize.

    Returns:
        A list of lowercase tokens, in the order they appear in the text.
    """
    # Use r'\W+' instead of '\W*': a pattern that can match the empty string
    # makes re.split() (Python 3.7+) split between every character, which
    # would turn the text into single letters instead of words.
    tokens = re.split(r'\W+', bigString)
    # Leading/trailing separators produce empty strings; drop them.
    return [tok.lower() for tok in tokens if tok]
'''这里出现错误最多的也还是Py2.x和Py3.x的不同导致的问题'''
def _readEmail(path):
    """Read the e-mail file at *path* and return its tokens via textParse."""
    # Open in binary mode and decode by hand: some sample files contain bytes
    # that are not valid GBK, so undecodable bytes are dropped ('ignore')
    # instead of raising UnicodeDecodeError. The `with` block guarantees the
    # file handle is closed.
    with open(path, 'rb') as emailFile:
        return textParse(emailFile.read().decode('GBK', 'ignore'))


def spamTest():
    """Run one random hold-out evaluation of the spam classifier.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (labelled 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (labelled 0), randomly moves
    10 of the 50 documents into a test set, trains on the remaining ones with
    ``bayes.trainNB0`` and prints the fraction of test documents for which
    ``bayes.classifyNB`` disagrees with the known label.

    Returns:
        None. The error rate is printed to standard output.
    """
    docList = []    # one token list per document
    classList = []  # label per document: 1 = spam, 0 = ham
    fullText = []   # all tokens flattened (extend, not append); kept from the
                    # original code, not used further in this function

    # Load the 25 spam / 25 ham texts, interleaved spam-then-ham per index.
    for i in range(1, 26):
        for pathTemplate, label in (('email/spam/%d.txt', 1),
                                    ('email/ham/%d.txt', 0)):
            wordList = _readEmail(pathTemplate % i)
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    # Build the reference vocabulary from all documents.
    vocabList = LoadData.createVocabList(docList)

    # Randomly split document indices into training and test sets.
    # A real list is required: in Python 3 a range object does not support
    # item deletion.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # randrange never returns the upper bound, unlike
        # int(random.uniform(0, n)), whose end-point may be included.
        randIndex = random.randrange(len(trainingSet))
        testSet.append(trainingSet.pop(randIndex))

    # Vectorize the training documents and compute the model probabilities.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(LoadData.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))

    # Classify each held-out document and count the misclassifications.
    errorCount = 0
    for docIndex in testSet:
        wordVector = LoadData.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is :', float(errorCount)/len(testSet))