import pdb,jieba,string
#pdb.set_trace()
import os,sys
import numpy as np
#1 函数定义部分
def textParse2(bigString):
    """Tokenize a Chinese document and drop stop words / punctuation.

    Args:
        bigString: raw document text (one whole file's contents).
    Returns:
        List of tokens with stop words, ASCII punctuation, spaces and
        newlines removed.
    """
    # Load the stop-word list into a set: O(1) membership per token instead
    # of O(n) list scans; `with` guarantees the file handle is closed.
    with open('d:/email/stopwords.txt', encoding='utf8') as stop_f:
        stopwords = {line.strip() for line in stop_f}
    # Precise-mode segmentation (cut_all=False avoids overlapping tokens).
    seg_list = jieba.lcut(bigString, cut_all=False)
    # Keep the original substring-style punctuation test (`tok not in
    # string.punctuation`) so behavior matches the previous implementation.
    return [tok for tok in seg_list
            if tok not in stopwords
            and tok not in string.punctuation
            and tok not in (' ', '\n')]
def createVocabList(dataSet, classList):
    """Build the global vocabulary and one vocabulary set per class.

    Args:
        dataSet: list of tokenized documents (each a list of tokens).
        classList: parallel list of integer class labels in range(10).
    Returns:
        (vocabList, classVocabs): the merged vocabulary as a list, and a
        list of 10 per-class vocabulary sets (index == class id).
    """
    # Ten *independent* sets -- do NOT write [set()]*10, which aliases a
    # single shared set across all ten slots.
    perClass = [set() for _ in range(10)]
    for doc, label in zip(dataSet, classList):
        # Labels outside 0-9 are silently skipped, matching the original
        # if-chain which had no branch for them.
        if label in range(10):
            perClass[label] |= set(doc)
    # Global vocabulary is the union of all per-class vocabularies.
    allWords = set().union(*perClass)
    return list(allWords), list(perClass)
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a token list into a bag-of-words count vector over vocabList.

    Args:
        vocabList: ordered vocabulary; its order defines the vector columns.
        inputSet: token list for one document.
    Returns:
        List of ints, len(vocabList) long, with per-word occurrence counts.
        Tokens not in the vocabulary are ignored.
    """
    # Build word -> column map once: O(V + n) overall instead of the
    # original O(V * n) from calling vocabList.index() per token.
    column = {word: i for i, word in enumerate(vocabList)}
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        pos = column.get(word)
        if pos is not None:
            returnVec[pos] += 1
    return returnVec
def classList2Vec(classList):
    """Map Chinese class-folder names to integer class ids 0-9, in place.

    Args:
        classList: list of folder-name labels (the digits in each name are
            document counts, not part of the class id).
    Returns:
        The same list object, mutated: recognized labels replaced by their
        class id; unrecognized labels left unchanged.
    """
    # Folder name -> class id; replaces the original ten-branch elif chain.
    mapping = {
        '环境200': 0, '计算机200': 1, '交通214': 2, '教育220': 3,
        '经济325': 4, '军事249': 5, '体育450': 6, '医药204': 7,
        '艺术248': 8, '政治505': 9,
    }
    for i, label in enumerate(classList):
        if label in mapping:
            classList[i] = mapping[label]
    return classList
def classifyNB(vec2Classify, re3, re):
    """Return the class index with the highest naive-Bayes log score.

    Args:
        vec2Classify: bag-of-words count vector (numpy array) for one doc.
        re3: per-class log-likelihood matrix (classes x vocabulary).
        re: per-class prior probabilities (same length as re3).
    Returns:
        Index (numpy int) of the best-scoring class.
    """
    # Score each class: dot the count vector with its log-likelihood row
    # and add the log prior, then take the arg-max.
    scores = [sum(vec2Classify * logLike) + np.log(prior)
              for logLike, prior in zip(re3, re)]
    return np.argmax(scores)
# 2 Data loading and training
# 2.1 Load the corpus: one sub-directory per class under d:/jiqixuexi,
#     one document per file inside each sub-directory.
list1 = os.listdir('d:/jiqixuexi')
docList = []; classList = []; fullText = []
for l in list1:
    filepath = os.path.join('d:/jiqixuexi/', l)
    if os.path.isdir(filepath):
        for li in os.listdir(filepath):
            #print(filepath,li)
            # Read and tokenize every document in this class folder.
            wordList = textParse2(open(filepath+'/'+li, 'r', encoding='utf8').read())
            docList.append(wordList)
            fullText.extend(wordList)
            # Label is the folder name (e.g. '环境200'); mapped to 0-9 below.
            classList.append(filepath.split('/')[-1])
    else:
        pass
# 2.2 Hold out a random test set (200 draws with possible duplicates,
#     deduplicated below); the remaining documents form the training set.
classList = classList2Vec(classList)
vocabList1, vocabList2 = createVocabList(docList, classList)
trainingSet = list(range(len(docList))); testSet = []
for i in range(200):
    # uniform(0, len) returns a float in [0, len), so after int() the
    # == len guard can never fire; kept as harmless belt-and-braces.
    randIndex = int(np.random.uniform(0, len(docList)))
    if randIndex == len(docList): randIndex -= 1
    testSet.append(trainingSet[randIndex])
trainingSet = list(set(trainingSet)-set(testSet))
testSet = list(set(testSet))
#del trainingSet[randIndex]
# Build the training matrix (one bag-of-words row per doc) and label vector.
trainMat = []; trainClasses = []
for docIndex in trainingSet:   # train the classifier (get probs) trainNB0
    trainMat.append(bagOfWords2VecMN(vocabList1, docList[docIndex]))
    trainClasses.append(classList[docIndex])
# 2.3 Training: estimate per-class "priors" and smoothed word likelihoods.
trainMatrix = np.array(trainMat)
trainCategory = np.array(trainClasses)
numTrainDocs = len(trainMatrix)
numWords = len(trainMatrix[0])
# Prior of each class, approximated by the share of the global vocabulary
# contributed by that class (NOTE(review): not the usual document-frequency
# prior -- confirm this is intended). `re` shadows the stdlib re module,
# but the name is kept because later code reads it.
re = np.array([len(v) / len(vocabList1) for v in vocabList2],
              dtype=np.float64)
# Per-class word-count accumulators, initialised to 1 (Laplace smoothing).
# Indexing by label replaces the original ten-branch elif chain.
re1 = np.ones((10, numWords), dtype=np.float64)
for i in range(numTrainDocs):
    label = trainCategory[i]
    if 0 <= label < 10:   # labels outside 0-9 were ignored by the old chain
        re1[label] += trainMatrix[i]
# Log-likelihoods: counts normalised by (class vocab size + total vocab size).
denom = np.array([len(vocabList2[i]) + len(vocabList1) for i in range(10)],
                 dtype=np.float64)
re3 = np.log(re1 / denom[:, None])
# 3 Evaluation on the held-out test set
errorCount = 0
for docIndex in testSet:   # classify the remaining items
    # Vectorize the held-out document with the same vocabulary used in
    # training, then compare the predicted class id with the true label.
    wordVector = bagOfWords2VecMN(vocabList1, docList[docIndex])
    if classifyNB(np.array(wordVector), re3, re) != classList[docIndex]:
        errorCount += 1
# Error rate over the test set
print('error rate:%f' % (float(errorCount)/len(testSet)))
# Number of misclassified documents
print('errorCount=%d' % errorCount)
# Size of the (deduplicated) test set
print('测试集的数量:', len(np.unique(testSet)))
# Python lecture 10: naive-Bayes multi-class classification (10 classes).
# (Blog-post footer from the original source, kept as a comment; published 2020-04-22 17:51:04.)