弱分类器生成:
# Weak-classifier generation: for every feature column of `data`, pick the
# single threshold (a decision stump) that misclassifies the fewest samples.
# Each stump is stored as [threshold, errorNum, flag]:
#   flag 0: errorNum counts the mistakes of ">= threshold is positive"
#   flag 1: errorNum counts the mistakes of ">= threshold is negative"
# The last column is read as the label; label == 1 is counted as positive and
# any other label as negative.
# NOTE(review): posNum / negNum are defined outside this chunk -- presumably
# the total number of positive / negative samples in `data`; confirm.
allWeakCLF = []
for feaInd in range(data.shape[1] - 1):
    tempD = data.iloc[:, [feaInd, -1]]
    # DataFrame.sort(columns=...) no longer exists in current pandas;
    # sort_values(by=...) is its replacement.
    tempD = tempD.sort_values(by=tempD.columns[0], ascending=False)
    tempD = np.array(tempD)
    # [threshold, errorNum, flag]; inf means "no threshold scored yet", so the
    # first scored threshold always wins regardless of the dataset size.
    bestCLF = [0.0, float('inf'), 0]
    curPosNum = 0  # positives seen so far, i.e. with value >= current threshold
    curNegNum = 0  # negatives seen so far, i.e. with value >= current threshold
    lastRow = len(tempD) - 1
    for rowInd, (threshold, label) in enumerate(tempD):
        if label == 1:
            curPosNum += 1
        else:
            curNegNum += 1
        # Samples sharing one feature value always land on the same side of a
        # ">= threshold" rule, so the counts only describe a real classifier
        # once the whole run of equal values has been consumed.
        if rowInd < lastRow and tempD[rowInd + 1][0] == threshold:
            continue
        errorNum0 = (posNum - curPosNum) + curNegNum  # ">= threshold is positive"
        errorNum1 = (negNum - curNegNum) + curPosNum  # ">= threshold is negative"
        if errorNum0 >= errorNum1:
            if errorNum1 < bestCLF[1]:
                bestCLF = [threshold, errorNum1, 1]
        else:  # errorNum0 < errorNum1
            if errorNum0 < bestCLF[1]:
                bestCLF = [threshold, errorNum0, 0]
    allWeakCLF.append(bestCLF)
# Persist the stumps so later runs can skip the search above.
joblib.dump(allWeakCLF, 'allWeakCLF.pkl')
# To reuse a previous run instead of recomputing:
#allWeakCLF=joblib.load('allWeakCLF.pkl')
随机方式生成强分类器或固定方式生成强分类器:
def MyAdaBoost(data, WeakClfInfo):
    """Weight an ordered list of threshold classifiers AdaBoost-style.

    The weak classifiers are visited in the given order. For each one the
    weighted error rate over the current sample weights is measured, the
    sample weights are re-balanced towards the misclassified samples, and
    the classifier's own row is updated.

    Args:
        data: 2-D array indexed as ``data[:, [ind, -1]]``; the last column is
            the label. Only labels equal to 0 or 1 can be counted as errors.
        WeakClfInfo: list of mutable rows
            ``[errorNum, ind, threshold, flag, clfWeight]``. ``ind`` is the
            feature column, ``flag == 0`` means ">= threshold is positive",
            any other flag means ">= threshold is negative".

    Returns:
        The same ``WeakClfInfo`` list, modified in place. In every row,
        slot 0 holds ``sqrt((1 - err) / err)`` (the re-weighting factor, not
        the raw error rate ``err``) and slot 4 holds the natural log of that
        factor, which is the classifier weight.

    Raises:
        ZeroDivisionError: if ``data`` has no rows.
    """
    # A weak classifier that is right (or wrong) on every sample has an error
    # rate of exactly 0 (or 1); the formulas below would then divide by zero
    # or take log(0). Clamping keeps the factor finite and non-zero.
    minError = 1e-10
    sampleNum = len(data)

    #1: init -- every sample starts with the same weight
    dataWeight = [1.0 / sampleNum] * sampleNum

    #2: generate strong clf
    for j, (_, ind, threshold, flag, _) in enumerate(WeakClfInfo):  # for each weak clf
        # Which label is a mistake on each side of the threshold.
        if flag == 0:  # ">= threshold is positive"
            wrongAbove, wrongBelow = 0, 1
        else:  # ">= threshold is negative"
            wrongAbove, wrongBelow = 1, 0

        #2.1: calculate normalized weighted error
        errorFlag = [0] * sampleNum  # 0: correct, 1: incorrect
        errorWeight = 0.0
        totalWeight = 0.0
        for i, (value, label) in enumerate(data[:, [ind, -1]]):  # for each sample
            totalWeight += dataWeight[i]
            if (value >= threshold and label == wrongAbove) or \
                    (value < threshold and label == wrongBelow):
                errorWeight += dataWeight[i]
                errorFlag[i] = 1
        errorRate = errorWeight / totalWeight
        errorRate = min(max(errorRate, minError), 1.0 - minError)
        delta = math.sqrt((1 - errorRate) / errorRate)

        #2.2: update dataWeight according to delta
        for i, wrong in enumerate(errorFlag):
            if wrong:
                dataWeight[i] *= delta
            else:
                dataWeight[i] /= delta
        # The error rate above is a ratio, so rescaling all weights does not
        # change any result; it only stops the weights from underflowing to
        # zero after many rounds with a large delta.
        weightSum = sum(dataWeight)
        dataWeight = [weight / weightSum for weight in dataWeight]

        #2.3: update the weak clf row according to delta
        # Index with j (the current weak classifier), not 0 -- writing to
        # WeakClfInfo[0] was a bug that took a long time to find.
        WeakClfInfo[j][0] = delta  # errorNum ==> re-weighting factor
        WeakClfInfo[j][4] = math.log(delta)  # clfWeight

    #3: return the strong clf (WeakClfInfo now carries the strong info)
    return WeakClfInfo
#how many strClf we want to train, weakClfNum in each strClf
def RandomGenerateStrClf(data, allWeakCLF, strClfNum=10, weakClfNum=60):
    """Train strong classifiers from random subsets of the weak classifiers.

    For each strong classifier, ``weakClfNum`` distinct weak classifiers are
    drawn with ``random.sample``, turned into rows
    ``[errorNum, ind, threshold, flag, clfWeight]`` (``clfWeight`` starts
    at 1), sorted, and passed to ``MyAdaBoost``.

    Args:
        data: sample array, passed through to ``MyAdaBoost``.
        allWeakCLF: list of ``[threshold, errorNum, flag]`` entries; the
            position in the list is used as the feature index ``ind``.
        strClfNum: how many strong classifiers to train.
        weakClfNum: how many weak classifiers go into each strong one.

    Returns:
        List with one ``MyAdaBoost`` result per strong classifier.

    Raises:
        ValueError: from ``random.sample`` when ``weakClfNum`` is larger than
            ``len(allWeakCLF)``.
    """
    randomStrClf = []
    weakClfInd = range(len(allWeakCLF))  # weakClfInd ~= feaInd
    for i in range(strClfNum):
        # Single-argument print() emits the same line on Python 2 and 3.
        print(" ".join(["training the", str(i+1), "strong classifier", "^_^"*10]))
        sampledWeakClfInd = random.sample(weakClfInd, weakClfNum)
        # errorNum, ind, threshold, 0 or 1(flag), clfWeight(init with 1)
        sampledWeakClfInfo = [
            [allWeakCLF[ind][1], ind, allWeakCLF[ind][0], allWeakCLF[ind][2], 1]
            for ind in sampledWeakClfInd
        ]
        # Rows start with errorNum, so sorting puts the lowest-error weak
        # classifiers first in the boosting order.
        sampledWeakClfInfo.sort()
        # MyAdaBoost returns the rows as
        # [errorNum->delta, ind, threshold, 0 or 1(flag), new_clfWeight]
        randomStrClf.append(MyAdaBoost(data, sampledWeakClfInfo))
    return randomStrClf
#how many strClf we want to train, weakClfNum in each strClf
def StaticGenerateStrClf(data, sortedWeakClfInfo, strClfNum=10, weakClfNum=20):
    """Train strong classifiers from fixed windows of the weak classifiers.

    Strong classifier ``i`` is trained by ``MyAdaBoost`` on the window
    ``sortedWeakClfInfo[i*step : i*step + weakClfNum]``, where ``step`` is
    ``len(sortedWeakClfInfo) // strClfNum`` capped at ``weakClfNum``.
    Windows overlap whenever ``step`` is smaller than ``weakClfNum``.

    Args:
        data: sample array, passed through to ``MyAdaBoost``.
        sortedWeakClfInfo: list of rows
            ``[errorNum, ind, threshold, flag, clfWeight]`` in the order the
            windows should be cut from. It is not modified.
        strClfNum: how many strong classifiers to train.
        weakClfNum: maximum number of weak classifiers per strong one.

    Returns:
        List with one ``MyAdaBoost`` result per strong classifier; empty when
        ``strClfNum`` is not positive.
    """
    staticStrClf = []
    if strClfNum <= 0:
        return staticStrClf
    # Derive the stride from the list actually being sliced (not from a
    # module-level variable), and keep it an integer so it can index slices.
    step = min(len(sortedWeakClfInfo) // strClfNum, weakClfNum)
    for i in range(strClfNum):
        # Single-argument print() emits the same line on Python 2 and 3.
        print(" ".join(["training the", str(i+1), "strong classifier", "^_^"*10]))
        start = i * step
        # MyAdaBoost rewrites the rows it is given. Copy them so overlapping
        # windows do not overwrite the rows of an earlier strong classifier.
        window = [list(info) for info in sortedWeakClfInfo[start:start + weakClfNum]]
        # MyAdaBoost returns the rows as
        # [errorNum->delta, ind, threshold, 0 or 1(flag), new_clfWeight]
        staticStrClf.append(MyAdaBoost(data, window))
    return staticStrClf