项目介绍:利用adaboost对Employee满意度进行分类
0.整理数据
从https://www.datafountain.cn/datasets/12下载IBM员工满意度的虚拟数据,并做好员工满意度类型标签,并整理成txt档案。
1.分析目的
对员工满意度进行预测分类。
2. 分析代码
从实际分类效果来看,adaboost在测试集数据上的错误率约为20.4%,正确率约79.6%,测试效果还不错。
# NOTE: this section is an IPython/Jupyter transcript — the %matplotlib
# magics and bare expressions (classifierArray, errnum, ...) only work in
# that environment; the lines starting with "#total error" and the list of
# dicts are pasted cell output, not statements to type in.
import pandas as pd
import numpy as np
%matplotlib inline
%matplotlib notebook
import matplotlib.pyplot as plt
from numpy import *
import adaboost
# Load the training data
datArr,labelArr = adaboost.loadDataSet('HR_Employee_traindata2.txt')
# Train the boosted classifier (10 weak learners)
classifierArray,_ = adaboost.adaBoostTrainDS(datArr,labelArr,10)
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
#total error: 0.1891156462585034
classifierArray
[{'dim': 0, 'thresh': 13.8, 'ineq': 'lt', 'alpha': 0.727883366967329},
{'dim': 24, 'thresh': 2.4, 'ineq': 'gt', 'alpha': 0.12463342761307217},
{'dim': 10, 'thresh': 2.2, 'ineq': 'gt', 'alpha': 0.1003746703517236},
{'dim': 20, 'thresh': 3.1, 'ineq': 'lt', 'alpha': 0.0963189756522224},
{'dim': 8, 'thresh': 79.0, 'ineq': 'gt', 'alpha': 0.0949072890789229},
{'dim': 4, 'thresh': 2.2, 'ineq': 'gt', 'alpha': 0.08407359046211836},
{'dim': 3, 'thresh': 1.0, 'ineq': 'lt', 'alpha': 0.0960231041915813},
{'dim': 26, 'thresh': 6.8, 'ineq': 'lt', 'alpha': 0.0948392081206876},
{'dim': 4, 'thresh': 3.0, 'ineq': 'lt', 'alpha': 0.06227497331058203},
{'dim': 18, 'thresh': 20.799999999999997, 'ineq': 'gt', 'alpha': 0.07452043991609442}]
# Load the test data
testArr,testLabelArr = adaboost.loadDataSet('HR_Employee_testdata2.txt')
# Classify the test samples
prediction10 = adaboost.adaClassify(testArr,classifierArray)
# Count misclassifications (the test set has 735 samples)
errArr=mat(ones((735,1)))
errnum = errArr[prediction10!=mat(testLabelArr).T].sum()
errnum
#150.0
# Error rate = misclassified / total
errnum/len(errArr)
#0.20408163265306123
3.adaboost源码
#coding=utf-8
from numpy import *
def loadSimpData():
    """Return a tiny hand-crafted 2-D dataset and its +1/-1 class labels."""
    points = [[1.0, 2.1],
              [2.0, 1.1],
              [1.3, 1.0],
              [1.0, 1.0],
              [2.0, 1.0]]
    labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return matrix(points), labels
# Decision-stump prediction on one feature column
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Label every sample +1, then flip to -1 the samples on the side of
    *threshVal* selected by *threshIneq*: 'lt' flips values <= threshVal,
    anything else flips values > threshVal."""
    nSamples = shape(dataMatrix)[0]
    predictions = ones((nSamples, 1))
    column = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        flipMask = column <= threshVal
    else:
        flipMask = column > threshVal
    predictions[flipMask] = -1.0
    return predictions
# Find the best single decision stump for the current sample weights
def buildStump(dataArr, classLabels, D):
    """Search for the decision stump with the lowest D-weighted error.

    Every feature is scanned with numSteps evenly spaced thresholds over
    its value range (plus one step below the minimum), trying both the
    '<=' ('lt') and '>' ('gt') orientations.

    Returns (bestStump, minError, bestClasEst): the stump description
    dict ({'dim', 'thresh', 'ineq'}), its weighted error rate, and its
    predicted +1/-1 labels.
    """
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf  # lowest weighted error seen so far
    for dim in range(n):  # every feature
        lo = dataMatrix[:, dim].min()
        hi = dataMatrix[:, dim].max()
        stepSize = (hi - lo) / numSteps  # feature-aware threshold step
        for step in range(-1, int(numSteps) + 1):  # sweep the thresholds
            threshVal = lo + float(step) * stepSize
            for inequal in ('lt', 'gt'):  # both orientations
                predictedVals = stumpClassify(dataMatrix, dim, threshVal, inequal)
                errArr = mat(ones((m, 1)))  # start with "all wrong"
                errArr[predictedVals == labelMat] = 0  # zero out the correct ones
                # weight vector x error vector = weighted error rate
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump = {'dim': dim, 'thresh': threshVal, 'ineq': inequal}
    return bestStump, minError, bestClasEst
# Full AdaBoost training routine (decision stumps as weak learners)
def adaBoostTrainDS(dataArr,classLabels,numIt=40): # numIt: maximum number of boosting iterations
    weakClassArr = []
    m = shape(dataArr)[0]  # number of training samples (rows)
    D = mat(ones((m,1))/m)  # initialize each sample weight to 1/m
    aggClassEst = mat(zeros((m,1)))  # running aggregate class estimate per sample
    for i in range(numIt):
        # Fit one decision stump under the current sample weights D
        bestStump,error,classEst = buildStump(dataArr,classLabels,D)
        print ("D:",D.T)
        # alpha: this weak classifier's voting weight in the ensemble
        alpha = float(0.5*log((1.0-error)/max(error,1e-16)))  # 1e-16 guards against division by zero
        bestStump['alpha'] = alpha  # store alpha in the stump dict
        weakClassArr.append(bestStump)  # add stump to the ensemble
        print ("classEst: ",classEst.T)
        # Recompute the sample weights for the next iteration:
        # misclassified samples gain weight, correct ones lose it
        expon = multiply(-1*alpha*mat(classLabels).T,classEst)
        D = multiply(D,exp(expon))
        D = D/D.sum()  # renormalize so the weights sum to 1
        # Track the ensemble's cumulative training error
        aggClassEst += alpha*classEst
        print ("aggClassEst: ",aggClassEst.T)
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T,ones((m,1)))
        errorRate = aggErrors.sum()/m
        print ("total error: ",errorRate)
        if errorRate == 0.0: break  # stop early once the training error hits 0
    return weakClassArr,aggClassEst
# Classify samples with a trained AdaBoost ensemble
def adaClassify(datToClass, classifierArr):
    """Apply each weak classifier in *classifierArr* to *datToClass*.

    Every stump's +1/-1 prediction is scaled by its 'alpha' weight and
    accumulated; the sign of the accumulated score is the final label.

    Returns an m x 1 matrix of +1.0/-1.0 class labels.
    """
    dataMatrix = mat(datToClass)  # convert the samples to a NumPy matrix
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for i in range(len(classifierArr)):  # iterate over all weak classifiers
        classEst = stumpClassify(dataMatrix,
                                 classifierArr[i]['dim'],
                                 classifierArr[i]['thresh'],  # FIX: was `classifierAr[i]` (NameError at runtime)
                                 classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha'] * classEst
        print(aggClassEst)  # show the running score after each weak classifier
    return sign(aggClassEst)  # score > 0 -> +1, score < 0 -> -1
# Apply to a harder dataset
# Adaptive data-loading routine for tab-separated files
def loadDataSet(fileName):
    """Load a tab-separated data file.

    The field count is taken from the first line; every column but the
    last is parsed as a float feature and the last column as the float
    class label.

    Returns (dataMat, labelMat): a list of feature rows and a list of labels.

    Fix vs. the original: the file was opened twice (once just to count
    the fields) and never closed — a resource leak. A single `with`-managed
    read preserves the same parsing semantics.
    """
    dataMat = []
    labelMat = []
    with open(fileName) as fr:
        lines = fr.readlines()
    if not lines:  # empty file -> empty dataset, same as before
        return dataMat, labelMat
    numFeat = len(lines[0].split('\t'))  # number of fields per row
    for line in lines:
        curLine = line.strip().split('\t')
        dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
4.百度网盘分享
已将数据源和代码打包上传百度网盘,若需要数据,请打赏任意金额并留下联系邮箱,将给予密码。
https://pan.baidu.com/s/1gP34nFySkN8QBQm5rqb77Q