前言
贝叶斯也是一种常用的分类器,主要用于
垃圾邮件分类,文本分类,智能监控的背景建模,人脸识别等方面
目录
- 基本概念
- 贝叶斯决策
- 朴素贝叶斯分类器
- 正态贝叶斯分类器
- 例子
一 基本概念
先看一个问题:
盒子 | 白球 | 黑球 | 抽中概率 |
A | 2 | 3 | 1/3 |
B | 2 | 2 | 1/3 |
C | 3 | 3 | 1/3 |
三个盒子,每个抽中概率都是1/3, 现在抽到一个白球(x=白色),这个球来自于
盒子A,B,C 的概率是多少(Y=a,b,c)
1.1 全概率公式
若事件组 c₁, c₂, …, cₙ 构成一个完备事件组,且每个事件都有正概率,则对任何一事件 x,有 P(x) = Σᵢ P(cᵢ)P(x|cᵢ)
1.2 贝叶斯公式
回到上面例子:
x = 白球,则 p(x) = 1/3×(2/5 + 1/2 + 1/2) = 7/15 (注意 C 盒 3 白 3 黑,p(白|C)=3/6=1/2)
P(y=A)·p(x=白球|y=A) = 1/3 × 2/5 = 2/15
则 P(y=A|x=白球) = (2/15)/(7/15) = 2/7
1.3 事件独立
事件A发生的可能性不受到事件B的影响
性质1 : P(AB)= P(B)P(A)
性质2: 若 A 与 B 相互独立,则 A 与 B 的对立事件 B̄ 也相互独立
性质3: 如相互独立,则有
性质4:
证明用到:
二 贝叶斯决策
贝叶斯决策基于联合概率进行建模
因为p(x) 对不同的分类都是一致的,所以可以求解目标可以转为
三 朴素贝叶斯分类
由于x是多维的,朴素贝叶斯是在每个维度之间是条件独立的一种推导过程
3.1 离散类型
为了防止过小,使用对数
最后一项,对所有分类都相同,所以比较大小时候可以忽略
其中:
为了防止某个概率为0 ,会使用拉普拉斯平滑,
拉普拉斯平滑中的 k 为标签的类别数:例如 y 分为两类时 k=2,分为三类时 k=3
3.2 连续类型
如果特征值是连续随机变量,可以假设分布服从正态分布。得到概率密度函数
分类器为:
如果是二分类, 取对数,
这里注意:
1: 在使用连续特征的时候,数据最好要标准化,否则测试出来差别会很大。
在例子中,3分类使用前准确率只有50%,标准化后,train集上准确率到达了90%
这里采用sklearn库中的StandardScaler方法
import numpy as np
from sklearn.preprocessing import StandardScaler
def StandardData():
    """Demonstrate feature standardization with sklearn's StandardScaler."""
    samples = [[1, 2, 3],
               [1, 3, 3],
               [1, 4, 12]]
    scaler = StandardScaler()
    scaler.fit(samples)
    mean = scaler.mean_        # per-column mean
    scale_is = scaler.scale_   # per-column standard deviation
    var_is = scaler.var_
    print("\n mean: ", mean, "\n scale_is : ", scale_is, "\n var: ", var_is)
    transformed = scaler.transform(samples)
    print("\n ***************\n")
    print("标准化后 ", transformed)
四 正态贝叶斯分类器
假设样本特征向量服从多维正态分布,此时的贝叶斯分类器称为正态贝叶斯分类器
假设特征向量为 n 维,其中 μ 为均值向量,Σ 为协方差矩阵,条件概率密度为 p(x|c) = (2π)^(−n/2) |Σ|^(−1/2) exp(−(x−μ)ᵀΣ⁻¹(x−μ)/2)
因为是对称矩阵,可以借助奇异分解,求解行列式和逆矩阵,
所以
对角矩阵的逆矩阵是主对角元素的倒数
正交矩阵的逆矩阵,是其转置矩阵
预测算法:
求对数,进一步简化
在求逆矩阵的时候,如果直接用Inv函数求,协方差矩阵是奇异矩阵,会出错,
可以采用奇异分解的求解方法
五 例子
5.1 文本分类_ 离散朴素贝叶斯分类器
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 12 16:04:27 2019
@author: chengxf2
"""
import numpy as np
import matplotlib.pyplot as plt
"""
加载数据集
Args
None
returns:
dataList: 数据集
calssArr: 标签 1 侮辱性言论 0: 正常言论
"""
def LoadData():
dataList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
['stop', 'posting', 'stupid', 'worthless', 'garbage'],
['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
classArr = [0,1,0,1,0,1] #1 is abusive, 0 not
return dataList, classArr
"""
把单词转换为set
Args:
dataList: 数据集
return
None
"""
def CreateWordSet(wordList):
dataSet =set([])
for word in wordList:
dataSet = dataSet|set(word)
print("\n dataSet: ", dataSet)
return list(dataSet)
"""
统计词频
Args
inputWord: 单独一句话
dataSet: 所有的单词
return
returnVec--单词出现的情况
"""
def GetWordFreq(wordSet, inputWord):
wordFreq =[0]*len(wordSet)
for word in inputWord:
if word in wordSet:
wordFreq[wordSet.index(word)]=1
else:
print("\n not Found")
# print("\n returnVec: ",returnFre)
return wordFreq
"""
得到对应的概率
Args
trainMat: 数据集
trainClass: 数据集分类
return
"""
def train(trainMat, trianClass):
m = len(trainMat) ##数据集个数
n = len(trainMat[0]) ##列的维度
pb = sum(trianClass)/float(m) #侮辱性语句概率
p0Num = np.ones(n)
p1Num = np.ones(n)
p0Demo = 2.0
p1Demo = 2.0
for i in range(m):
if trianClass[i]==1: ##侮辱性语句
p1Num += trainMat[i]
p1Demo += sum(trainMat[i])
else: #正常语句
p0Num += trainMat[i]
p0Demo += sum(trainMat[i])
p1 = p1Num/p1Demo
p0 = p0Num/p0Demo
#px = p1*pb +(1-pb)*p0
#print("\n *****px*****\n ",px)
return np.log(p0), np.log(p1),pb
def classify(word, p0, p1, pClass):
    """Classify a presence vector with the naive-Bayes decision rule.

    Args:
        word: 0/1 presence vector for the sentence.
        p0: log word likelihoods of the normal class (y=0).
        p1: log word likelihoods of the abusive class (y=1).
        pClass: prior probability of the abusive class.
    Returns:
        1 for abusive speech, 0 for normal speech.
    """
    score1 = sum(word * p1) + np.log(pClass)
    score0 = sum(word * p0) + np.log(1.0 - pClass)
    if score1 > score0:
        print("\n 侮辱性言论 \n")
        return 1
    else:
        print("\n 正常言论 \n")
        # BUG FIX: the original returned 1 on both branches, so every
        # sentence was reported as class 1 by the return value.
        return 0
"""
分类
Args
wordClass: 分类后的数据
p0: 侮辱性语句
p1: 正常语句出现的词频
pb: 正常语句出现的概率
"""
def Train():
wordList, wordClass = LoadData() ##加载数据集
wordSet = CreateSetList(wordList) ##建立词频列表
trainMat = []
for words in wordList:
wordT = Word2Vec(wordSet, words)
trainMat.append(wordT)
print("\n dataSet",trainMat)
p0,p1, pb =trainNB(trainMat, wordClass)
print("\n *********p0*********** \n",p0, "\n ******p1****** \n ",p1, "\n ******pb****** \n ",pb)
testWord = ['love','my','dog']
wordFre = GetWordFreq(wordSet,testWord)
classify(wordFre, p0,p1, pb)
testWord1 = ['stupid','garbage']
wordFre = GetWordFreq(wordSet,testWord1)
classify(wordFre, p0,p1, pb)
def Test():
    """Plot the natural logarithm over (0, 0.9) as a quick visual check."""
    xs = np.arange(0.01, 0.9, 0.05)
    plt.plot(xs, np.log(xs))
    plt.show()


tr = Train()  # run the end-to-end demo when the script executes
5.2 鸢尾花分类_ 连续型贝叶斯分类
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 16 14:04:20 2019
@author: chengxf2
"""
import numpy as np
from sklearn.datasets import load_iris ##鸢尾花数据集
import math
from sklearn.preprocessing import StandardScaler
"""
使用鸢尾花数据集、
数据集符合高斯分布
target_Name ['setosa' 'versicolor' 'virginica']
feature_names ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
"""
class GaussBayes:
    """Gaussian naive Bayes on the iris data set.

    target names: ['setosa' 'versicolor' 'virginica']
    feature names: ['sepal length (cm)', 'sepal width (cm)',
                    'petal length (cm)', 'petal width (cm)']
    Each feature is modeled by an independent 1-D normal distribution per
    class; classification picks the class maximizing
    log p(y=k) + sum_i log N(x_i; mean_ki, std_ki).
    """

    def __init__(self):
        self.m = 0         # number of training samples
        self.n = 0         # feature dimension
        self.K = 2         # number of classes used (binary subset of iris)
        self.EachEum = 50  # samples per class in the iris data set
        self.LoadData()
        self.Train()
        self.Test()

    def LoadData(self):
        """Load iris, standardize the features, keep the first K*EachEum rows.

        Standardization matters here: per the surrounding text, accuracy on
        the training set jumps from ~50% to ~90% with it.
        """
        IRIS = load_iris()
        iris_data = IRIS['data']
        iris_target = IRIS['target']
        num = self.K * self.EachEum
        std = StandardScaler()
        data = std.fit_transform(iris_data)
        self.trainData = list(data[0:num])
        self.trainTarget = list(iris_target[0:num])
        self.m, self.n = np.shape(self.trainData)
        print("\n m: \n ", self.m, "\t n: ", self.n)

    def GetPara(self, dataList):
        """Return [[mean, std], ...] for each feature column of dataList."""
        para = []
        for i in range(self.n):
            column = [item[i] for item in dataList]  # extract column i
            para.append([np.mean(column), np.std(column)])
        return para

    def Train(self):
        """Estimate log class priors (pDict) and per-class Gaussian params (qDict)."""
        pDict = {}  # k -> log p(y=k)
        qDict = {}  # k -> [[mean, std], ...] per feature
        for k in range(self.K):
            pDict[k] = []
            qDict[k] = []
        for key in pDict:
            num = self.trainTarget.count(key)
            pDict[key] = np.log(num / self.m)
        print("\n pDict: \n ", pDict)
        for k in range(self.K):
            low = k * self.EachEum
            up = (k + 1) * self.EachEum
            # NOTE(review): assumes iris rows are grouped by class in blocks
            # of EachEum, which holds for sklearn's load_iris ordering.
            curData = self.trainData[low:up]
            qDict[k] = self.GetPara(curData)
        print("\n qDict \n", qDict)
        self.pDict = pDict
        self.qDict = qDict

    def GetGauss(self, data, q):
        """Sum of per-feature Gaussian log densities for one sample.

        BUG FIX: the original computed gi = (b/c)*log(a) and subtracted it,
        which is not the log pdf.  The correct term is
        log N(x; mean, std) = log(a) - (x-mean)^2 / (2*std^2),
        with a = 1 / (std * sqrt(2*pi)).

        Args:
            data: one feature vector (length self.n).
            q: [[mean, std], ...] for one class.
        Returns:
            sum_i log N(data[i]; mean_i, std_i)
        """
        prob = 0.0
        for index in range(self.n):
            x = data[index]
            mean = q[index][0]
            std = q[index][1]
            a = 1.0 / (std * np.sqrt(2 * math.pi))   # normalizing constant
            b = np.power(x - mean, 2)
            c = 2 * np.power(std, 2)
            prob += np.log(a) - b / c
        return prob

    def Classify(self, data):
        """Return the class k maximizing log prior + Gaussian log likelihood."""
        kList = [-np.inf for _ in range(self.K)]
        for k in range(self.K):
            kList[k] = self.pDict[k] + self.GetGauss(data, self.qDict[k])
        return np.argmax(kList)

    def Test(self):
        """Report the error count and error rate on the training set."""
        error = 0
        for i in range(self.m):
            data = self.trainData[i]
            k = self.Classify(data)
            if k != self.trainTarget[i]:
                error = error + 1
        print("\n error: ", error, " 错误率: \t ", error / self.m)
def Temp():
    """Scratch check of list-comprehension initialization."""
    b = [-1 for i in range(3)]
    print("\t b ", b)


#Temp()
bayes = GaussBayes()  # runs load/train/test via __init__
5.3 声纳对雷达波反射的例子
数据集: http://t.cn/Rf8GrP7
声纳数据集(Sonar Dataset )涉及预测根据给定声纳从不同角度返回的强度预测目标物体是岩石还是矿井。
它是一个二元分类问题。每个类的观察值数量不均等。一共有208个观察值,60个输入变量和1个输出变量。变量名如下:
从不同角度返回的声纳
训练集上精度: 100%
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 19 15:40:45 2019
@author: chengxf2
"""
import numpy as np
import sys
import os
from sklearn.preprocessing import StandardScaler
class NormalBayes:
    """Normal (full-covariance Gaussian) Bayes classifier for the sonar data.

    Each class is modeled as a multivariate normal; a sample is assigned the
    class minimizing log|Sigma| + Mahalanobis distance - 2*log p(class).
    The covariance may be singular, so its inverse and determinant are taken
    through an SVD rather than a direct inversion.
    """

    def __init__(self):
        self.m = 0        # number of samples
        self.n = 0        # feature dimension
        self.mean = None
        self.cov = None
        self.LoadData()
        self.Predeict(self.dataList)

    def GetPara(self, dataList, dataLabel):
        """Compute per-class mean, covariance, determinant and inverse.

        For a symmetric matrix cov = U W U^T (SVD), so
        inverse = U W^-1 U^T and determinant = det(W).

        Args:
            dataList: feature vectors.
            dataLabel: label per vector; class 1 is self.LableList[0].
        Side effects:
            sets mean1/mean2, cov1/cov2, Inv1/Inv2, del1/del2.
        """
        data1 = []
        data2 = []
        m = len(dataList)
        type1 = self.LableList[0]
        print("\n m: ", m, " 第一类:", type1)
        for i in range(m):
            label = dataLabel[i]
            data = dataList[i]
            print("label: ", label)
            if type1 == label:
                data1.append(data)
            else:
                data2.append(data)
        print("\n data1 ", data1)
        print("\n data2 ", data2)
        self.mean1 = np.mean(data1, axis=0)
        self.mean2 = np.mean(data2, axis=0)
        print("\n 男生平均身高: ", self.mean1, "\n 女生平均身高:", self.mean2)
        cov1 = np.cov(data1, rowvar=False, bias=False)
        cov2 = np.cov(data2, rowvar=False, bias=False)
        self.cov1 = np.mat(cov1)
        self.cov2 = np.mat(cov2)
        U1, W1, U1T = np.linalg.svd(self.cov1)
        U2, W2, U2T = np.linalg.svd(self.cov2)
        num1 = len(W1)
        num2 = len(W2)
        W_1 = np.zeros((num1, num1))
        # BUG FIX: the original sized W_2 and ran its fill loop with num1;
        # it must use num2 (the two are equal only by coincidence here).
        W_2 = np.zeros((num2, num2))
        for i in range(num1):
            W_1[i, i] = W1[i]
        for j in range(num2):
            W_2[j, j] = W2[j]
        # Inverse of a diagonal matrix is the reciprocal of its diagonal;
        # inverse of an orthogonal matrix is its transpose.
        inv1 = U1 * np.linalg.inv(W_1) * U1T
        inv2 = U2 * np.linalg.inv(W_2) * U2T
        delt1 = np.linalg.det(W_1)
        delt2 = np.linalg.det(W_2)
        print("\n 行列式男::: \n ", delt1, "\n 行列式女: \n", delt2)
        self.Inv1 = inv1
        self.Inv2 = inv2
        self.del1 = delt1
        self.del2 = delt2
        print("Inv: \n", self.Inv1, " Inv2: \n ", self.Inv2)
        print("Inv: \n", self.del1, " Inv2: \n ", self.del2)

    def GetFile(self):
        """Return the absolute path of the sonar data file."""
        fileName = "data\\sonar.txt"
        filePath = os.path.abspath(fileName)
        return filePath

    def LoadData(self):
        """Read the comma-separated sonar file; the last field is the label.

        Side effects: sets dataList, dataLabel, LableList, m, n, the log
        priors p1/p2, and triggers GetPara.
        """
        path = self.GetFile()
        dataList = []
        dataLabel = []
        # BUG FIX: the original leaked the file handle; use a context manager.
        with open(path) as fp:
            for line in fp.readlines():
                lineList = line.strip().split(',')
                if len(lineList) <= 1:   # skip blank / malformed lines
                    continue
                Label = lineList[-1]
                datastr = lineList[0:-1]
                data = [float(i) for i in datastr]
                dataLabel.append(Label.strip())
                dataList.append(data)
        # Optional standardization (disabled in the original as well):
        # std = StandardScaler(); dataList = std.fit_transform(dataList)
        self.dataList = dataList
        self.dataLabel = dataLabel
        self.LableList = list(set(dataLabel))
        # BUG FIX: the prior of class 1 must count LableList[0]; the original
        # counted dataLabel[0], which may be the other class since
        # list(set(...)) has no guaranteed order.
        n1 = dataLabel.count(self.LableList[0])
        self.m, self.n = np.shape(dataList)
        self.p1 = np.log(n1 / self.m)            # log prior of LableList[0]
        self.p2 = np.log((self.m - n1) / self.m)  # log prior of LableList[1]
        self.GetPara(self.dataList, self.dataLabel)
        print("\n m: ", self.m, "n: ", self.n, "\t p1: ", self.p1, "p2: ", self.p2)

    def Getp(self, data, mean, det, INV, pc):
        """Discriminant value for one class; smaller is better.

        Args:
            data: sample as a 1 x n np.mat.
            mean: class mean vector.
            det: determinant of the class covariance.
            INV: inverse of the class covariance.
            pc: prior term subtracted from the discriminant.
        Returns:
            log(det) + Mahalanobis distance - pc (1x1 matrix).
        """
        x = data - mean
        t1 = np.log(det)
        t2 = x * INV * x.T   # Mahalanobis distance
        p = t1 + t2 - pc
        return p

    def Predeict(self, dataList):
        """Classify every sample; print and return (num1, num2, error).

        BUG FIX: the original passed the constant 1 as the prior term, so the
        class priors computed in LoadData were never used.  Minimizing
        log|Sigma| + Mahalanobis - 2*log p(class) is the correct rule, hence
        pc = 2*self.p1 / 2*self.p2.
        """
        num1 = 0
        num2 = 0
        num = len(dataList)
        error = 0
        for i in range(num):
            data = dataList[i]
            label = self.dataLabel[i]
            p1 = self.Getp(np.mat(data), self.mean1, self.del1, self.Inv1, 2 * self.p1)
            p2 = self.Getp(np.mat(data), self.mean2, self.del2, self.Inv2, 2 * self.p2)
            PredeictLabel = self.LableList[0]
            if p1 < p2:
                num1 = num1 + 1
            else:
                PredeictLabel = self.LableList[1]
                num2 = num2 + 1
            if PredeictLabel != label:
                error = error + 1
        print("\n 男生 ", num1, "\t 女生 : ", num2, " 错误率: ", error)
        return num1, num2, error
# Instantiate the classifier; __init__ immediately loads the sonar data and runs prediction.
BAYSE = NormalBayes()