Python写贝叶斯分类器

数据源(下载后请保存到脚本所在目录,并命名为 pima-indians-diabetes.data.csv,即 main() 中读取的文件名)
https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data

# coding:utf-8
import csv
import random
import math
# Read the data file
def loadCsv(filename):
    """Read a CSV file of numeric values into a list of float rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV row; each entry is the row's
        cells converted to float.

    Raises:
        ValueError: If a cell cannot be converted to float.
    """
    # The with-block closes the file even if a conversion fails; newline=''
    # is the mode the csv module asks for.
    with open(filename, 'r', newline='') as handle:
        # Blank lines come back from csv.reader as empty lists; skip them so
        # every returned row has a last column to use as the class value.
        return [[float(cell) for cell in row] for row in csv.reader(handle) if row]
# Split the data into a training set and a test set
def splitDataset(dataset, splitRatio):
    """Randomly partition ``dataset`` into training and test rows.

    Args:
        dataset: List of rows; it is copied, not modified.
        splitRatio: Fraction of the rows that goes into the training set.

    Returns:
        A two-element list ``[trainSet, testSet]``. ``trainSet`` holds
        ``int(len(dataset) * splitRatio)`` rows drawn at random without
        replacement; ``testSet`` holds the rows that were not drawn.
    """
    remaining = list(dataset)
    wanted = int(len(dataset) * splitRatio)
    chosen = []
    while len(chosen) < wanted:
        # Move one randomly selected row from the pool into the training set.
        chosen.append(remaining.pop(random.randrange(len(remaining))))
    return [chosen, remaining]
# Group rows by class
def seperateByClass(dataset):
    """Group the rows of ``dataset`` by the value in their last column.

    Args:
        dataset: List of rows; the last element of each row is used as the
            grouping key.

    Returns:
        A dict mapping each distinct last-column value to the list of rows
        carrying that value, in their original order.
    """
    groups = {}
    for row in dataset:
        groups.setdefault(row[-1], []).append(row)
    return groups
# Arithmetic mean
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Args:
        numbers: Sized collection of numeric values.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count
# Sample standard deviation
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are summed and divided by
    ``len(numbers) - 1`` before taking the square root.

    Args:
        numbers: Sized collection of numeric values.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two values.
    """
    center = mean(numbers)
    squared_total = 0
    for value in numbers:
        squared_total += pow(value - center, 2)
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(squared_total / degrees_of_freedom)
# Per-column statistics
def summarize(dataset):
    """Compute ``(mean, stdev)`` for every column of ``dataset`` except the last.

    Args:
        dataset: List of equally long numeric rows.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per column, with the entry
        for the final column removed.
    """
    stats = []
    # zip(*dataset) transposes the rows into columns.
    for column in zip(*dataset):
        stats.append((mean(column), stdev(column)))
    # Discard the statistics of the last column (seperateByClass uses that
    # column as the class value, not as a feature).
    stats.pop()
    return stats
# Per-class column statistics
def summarizeByClass(dataset):
    """Compute the column statistics separately for each class.

    Args:
        dataset: List of rows whose last element is the class value.

    Returns:
        A dict mapping each class value to the result of ``summarize`` on
        the rows of that class.
    """
    return {
        label: summarize(rows)
        for label, rows in seperateByClass(dataset).items()
    }
# Likelihood of a value under a Gaussian probability density function
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Value to evaluate.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density of the normal distribution with the given mean and
        standard deviation at ``x``. When ``stdev`` is 0 the distribution is
        treated as a point mass at ``mean``: 1.0 if ``x == mean``, else 0.0.
    """
    if stdev == 0:
        # A zero standard deviation would otherwise divide by zero below;
        # fall back to the degenerate case.
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
# Per-class likelihood of an input vector
def calculateClassProbabilities(summaries, inputVector):
    """Multiply the per-feature Gaussian densities of ``inputVector`` per class.

    Args:
        summaries: Dict mapping a class value to a list of ``(mean, stdev)``
            tuples, one per feature.
        inputVector: Sequence holding a value for each summarized feature.

    Returns:
        A dict mapping each class value to the product of
        ``calculateProbability`` over its features (starting from 1).
    """
    probabilities = {}
    for label, featureStats in summaries.items():
        product = 1
        for position, (mu, sigma) in enumerate(featureStats):
            product *= calculateProbability(inputVector[position], mu, sigma)
        probabilities[label] = product
    return probabilities
# Predict the class of a single input vector
def predic(summaries, inputVector):
    """Return the class whose computed probability for ``inputVector`` is largest.

    Args:
        summaries: Per-class feature statistics, passed through to
            ``calculateClassProbabilities``.
        inputVector: The row to classify.

    Returns:
        The class value with the highest probability; on ties the class
        encountered first wins. ``None`` if no class scores above -1
        (for example when ``summaries`` is empty).
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    bestLabel = None
    bestProb = -1
    for label, score in scores.items():
        if score > bestProb:
            bestLabel, bestProb = label, score
    return bestLabel
# Predict the class of every row in a test set
def getPredictions(summaries, testSet):
    """Classify each row of ``testSet`` with ``predic``.

    Args:
        summaries: Per-class feature statistics.
        testSet: List of rows to classify.

    Returns:
        A list of predicted class values, in the same order as ``testSet``.
    """
    return [predic(summaries, row) for row in testSet]
# Measure prediction accuracy
def getAccurancy(testSet, predictions):
    """Return the percentage of rows whose last column equals the prediction.

    Args:
        testSet: List of rows; the last element of each row is the actual
            class value.
        predictions: Predicted class values, indexed like ``testSet``.

    Returns:
        A float between 0.0 and 100.0. An empty ``testSet`` yields 0.0
        instead of dividing by zero.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    if not testSet:
        # Nothing to score (e.g. a split ratio that leaves no test rows).
        return 0.0
    correct = sum(
        1 for i, row in enumerate(testSet) if row[-1] == predictions[i]
    )
    return (correct / float(len(testSet))) * 100.0
# Entry point
def main():
    """Train on a random 70% of the data file and print the test accuracy.

    Reads 'pima-indians-diabetes.data.csv' from the current working
    directory, fits the per-class statistics on the training split, predicts
    the remaining rows and prints the accuracy percentage.
    """
    filename = 'pima-indians-diabetes.data.csv'
    splitRatio = 0.7
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    summaries = summarizeByClass(trainingSet)
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccurancy(testSet, predictions)
    print(accuracy)


# Run only when executed as a script, so importing this module does not
# trigger file I/O.
if __name__ == '__main__':
    main()
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值