前段时间学习python3的基础知识,现在做一些数据挖掘方面的小练习。
今天要做的是贝叶斯分类器,数据来源为老师的weather.arff, 首先要读取这个文件,要用到string的方法http://write.blog.csdn.net/postlist和文件读取方法http://write.blog.csdn.net/postlist 和arff文件格式http://blog.sina.com.cn/s/blog_9d40b61301012xci.html,程序如下:
# coding=utf-8
import re
import sys
def readArff(fileName):
    """Read the data rows of an ARFF file, print them, and return them.

    Lines that start with '@' or '%' and blank lines are skipped. Every
    remaining line is split on commas into a list of string fields, with
    surrounding whitespace removed from each field.

    Args:
        fileName: Path of the ARFF file to read.

    Returns:
        A list of rows; each row is a list of field strings.
    """
    data = []
    # The context manager closes the file even if reading raises.
    with open(fileName, 'r') as arffFile:
        for line in arffFile:
            # Strip first so that '\r\n' endings, indented '@'/'%' lines and
            # whitespace-only lines are treated like their clean equivalents.
            line = line.strip()
            if not line or line.startswith(('@', '%')):
                continue
            k = [field.strip() for field in line.split(',')]
            data.append(k)
            print(k)
    print(data)
    return data
if __name__ == '__main__':
    # Use the first command-line argument as the ARFF path when one is given;
    # otherwise fall back to the path that was hard-coded originally.
    if len(sys.argv) > 1:
        fileName = sys.argv[1]
    else:
        fileName = r'C:\Users\Administrator\Desktop\exepirenment\classifill\data\weather.arff'
    readArff(fileName)
输出结果如下:
['sunny', '85', '85', 'FALSE', 'no']
['sunny', '80', '90', 'TRUE', 'no']
['overcast', '83', '86', 'FALSE', 'yes']
['rainy', '70', '96', 'FALSE', 'yes']
['rainy', '68', '80', 'FALSE', 'yes']
['rainy', '65', '70', 'TRUE', 'no']
['overcast', '64', '65', 'TRUE', 'yes']
['sunny', '72', '95', 'FALSE', 'no']
['sunny', '69', '70', 'FALSE', 'yes']
['rainy', '75', '80', 'FALSE', 'yes']
['sunny', '75', '70', 'TRUE', 'yes']
['overcast', '72', '90', 'TRUE', 'yes']
['overcast', '81', '75', 'FALSE', 'yes']
['rainy', '71', '91', 'TRUE', 'no']
读取完数据后,则开始写贝叶斯算法,贝叶斯原理不明白的请看我的新浪博客: http://blog.sina.com.cn/s/blog_bee847a20102v1wz.html。完整程序如下:
# coding=utf-8
import re
import sys
import bisect
data = []  # module-level list of rows, filled in place by readArff()


def readArff(fileName):
    """Append the data rows of an ARFF file to the module-level ``data`` list.

    Lines that start with '@' or '%' and blank lines are skipped. Every
    remaining line is split on commas into a list of string fields, with
    surrounding whitespace removed from each field. Rows are appended, so
    calling this function more than once accumulates rows in ``data``.

    Args:
        fileName: Path of the ARFF file to read.

    Returns:
        The module-level ``data`` list (the same object that was extended).
    """
    # The context manager closes the file even if reading raises.
    with open(fileName, 'r') as arffFile:
        for line in arffFile:
            # Strip first so that '\r\n' endings, indented '@'/'%' lines and
            # whitespace-only lines are treated like their clean equivalents.
            line = line.strip()
            if not line or line.startswith(('@', '%')):
                continue
            data.append([field.strip() for field in line.split(',')])
    return data
def bayesion(testData, trainData=None):
    """Classify ``testData`` as 'yes' or 'no' with a naive Bayes score.

    Training rows whose last field is 'yes' form the first class; all other
    rows form the second class. The score of a class is its prior (share of
    training rows) multiplied, for every position ``i`` of ``testData``, by
    the fraction of the class's rows whose ``i``-th field equals
    ``testData[i]``. No smoothing is applied, so a value never seen in a
    class drives that class's score to zero.

    Args:
        testData: Attribute values to classify, in the same column order as
            the training rows.
        trainData: Training rows, each a sequence whose last field is the
            class label. Defaults to the module-level ``data`` list.

    Returns:
        'yes' when the first class scores strictly higher, otherwise 'no'.
        The decision is also printed.

    Raises:
        ValueError: If there are no training rows.
    """
    if trainData is None:
        trainData = data
    if not trainData:
        raise ValueError("bayesion() needs at least one training row")

    class1 = [item for item in trainData if item[-1] == 'yes']
    class2 = [item for item in trainData if item[-1] != 'yes']
    total = len(trainData)

    class1Probability = _classScore(testData, class1, total)
    class2Probability = _classScore(testData, class2, total)

    # A tie (including both scores being zero) is reported as "No".
    if class1Probability > class2Probability:
        print("The result is : Yes")
        return 'yes'
    print("The result is : No")
    return 'no'


def _classScore(testData, rows, total):
    """Return prior times the product of per-attribute match fractions."""
    if not rows:
        # An empty class has prior 0; returning early avoids dividing by zero.
        return 0.0
    score = len(rows) / total
    for i, value in enumerate(testData):
        matches = sum(1 for row in rows if row[i] == value)
        score *= matches / len(rows)
    return score
def dataPreprocessing1(data, breakpoint1=(70, 80), breakpoint2=(80, 90), newValue1='LMH'):
    """Bin columns 1 and 2 of every row of ``data`` in place.

    Each numeric string in column 1 is replaced by the label selected with
    ``breakpoint1``, and each one in column 2 by the label selected with
    ``breakpoint2``. A value equal to a breakpoint falls into the higher bin
    (``bisect.bisect`` inserts to the right of equal entries).
    NOTE(review): columns 1 and 2 are presumably temperature and humidity of
    the weather data set -- confirm against the ARFF header.

    Args:
        data: List of rows (``[row1, row2, ...]``); each row is a mutable
            list whose items 1 and 2 are numeric strings.
        breakpoint1: Ascending bin boundaries for column 1.
        breakpoint2: Ascending bin boundaries for column 2.
        newValue1: Bin labels, indexable by bin number; needs one more entry
            than the longer breakpoint sequence.

    Raises:
        ValueError: If a value in column 1 or 2 is not numeric, for example
            when the row has already been binned.
    """
    for item in data:
        # float() rather than int() so decimal readings such as '72.5' work.
        item[1] = newValue1[bisect.bisect(breakpoint1, float(item[1]))]
        item[2] = newValue1[bisect.bisect(breakpoint2, float(item[2]))]
def dataPreprocessing2(data, breakpoint1=(70, 80), breakpoint2=(80, 90), newValue1='LMH'):
    """Bin items 1 and 2 of a single row in place.

    The numeric string at index 1 is replaced by the label selected with
    ``breakpoint1`` and the one at index 2 by the label selected with
    ``breakpoint2``. A value equal to a breakpoint falls into the higher bin
    (``bisect.bisect`` inserts to the right of equal entries).

    Args:
        data: One row as a mutable list (``['..', '..', ...]``) whose items
            1 and 2 are numeric strings.
        breakpoint1: Ascending bin boundaries for item 1.
        breakpoint2: Ascending bin boundaries for item 2.
        newValue1: Bin labels, indexable by bin number; needs one more entry
            than the longer breakpoint sequence.

    Raises:
        ValueError: If item 1 or 2 is not numeric, for example when the row
            has already been binned.
    """
    # float() rather than int() so decimal readings such as '72.5' work.
    data[1] = newValue1[bisect.bisect(breakpoint1, float(data[1]))]
    data[2] = newValue1[bisect.bisect(breakpoint2, float(data[2]))]
if __name__ == '__main__':
    # Use the first command-line argument as the ARFF path when one is given;
    # otherwise fall back to the path that was hard-coded originally.
    if len(sys.argv) > 1:
        fileName = sys.argv[1]
    else:
        fileName = r'C:\Users\Administrator\Desktop\exepirenment\classifill\data\weather.arff'
    readArff(fileName)
    # Bin the training rows first so they are comparable with the binned
    # test row below.
    dataPreprocessing1(data)
    testData = ['overcast', '72', '80', 'TRUE']
    dataPreprocessing2(testData)
    bayesion(testData)
输出结果:
The result is : Yes