import random

import numpy as np
import xgboost
from sklearn.metrics import f1_score


class GeneticXgboost:
    """Genetic-algorithm search over seven XGBoost hyper-parameters.

    An individual is one row of seven genes, in this column order:
    learning_rate, n_estimators, max_depth, min_child_weight, gamma,
    subsample, colsample_bytree.
    """

    def __init__(self, num_parents=None):
        """Store the population size.

        param num_parents: number of individuals in the population
        """
        self.num_parents = num_parents

    def initilialize_poplulation(self):
        """Create the initial population by sampling every gene at random.

        Gene order: learning_rate, n_estimators, max_depth, min_child_weight,
        gamma, subsample, colsample_bytree.

        return: array, shape=[self.num_parents, num_gene]
        """
        size = self.num_parents
        learning_rate = np.empty([size, 1])
        # n_estimators is drawn from 10..1485, which does not fit in uint8
        # (max 255); use a wide integer type so values are stored intact.
        n_estimators = np.empty([size, 1], dtype=np.int64)
        max_depth = np.empty([size, 1], dtype=np.int64)
        min_child_weight = np.empty([size, 1])
        gamma_value = np.empty([size, 1])
        sub_sample = np.empty([size, 1])
        col_sample_by_tree = np.empty([size, 1])

        for i in range(size):  # generate one individual per iteration
            learning_rate[i] = round(np.random.uniform(0.01, 1), 2)
            n_estimators[i] = random.randrange(10, 1500, step=25)
            max_depth[i] = random.randrange(1, 10, step=1)
            min_child_weight[i] = round(random.uniform(0.01, 10.0), 2)
            gamma_value[i] = round(random.uniform(0.01, 10.0), 2)
            sub_sample[i] = round(random.uniform(0.01, 1.0), 2)
            col_sample_by_tree[i] = round(random.uniform(0.01, 1.0), 2)

        # Concatenating with the float columns yields a float array.
        return np.concatenate(
            (
                learning_rate,
                n_estimators,
                max_depth,
                min_child_weight,
                gamma_value,
                sub_sample,
                col_sample_by_tree,
            ),
            axis=1,
        )

    # Correctly spelled alias; the original name is kept for existing callers.
    initialize_population = initilialize_poplulation

    def fitness_function(self, y_true, y_pred):
        """Fitness of a prediction: weighted F1 score rounded to 4 decimals."""
        return round(f1_score(y_true, y_pred, average='weighted'), 4)

    def fitness_compute(self, population, dMatrixTrain, dMatrixtest, y_test):
        """Train one model per individual and score it on the test data.

        param population: population array, one individual per row
        param dMatrixTrain: training data as an xgboost DMatrix
        param dMatrixtest: test data as an xgboost DMatrix
        param y_test: true labels of the test data
        return: list with the fitness value of every individual
        """
        scores = []
        for individual in population:  # evaluate every individual
            params = {
                'objective': 'binary:logistic',
                'learning_rate': individual[0],
                'max_depth': int(individual[2]),
                'min_child_weight': individual[3],
                'gamma': individual[4],
                'subsample': individual[5],
                'colsample_bytree': individual[6],
                'seed': 24,
            }
            # xgboost.train ignores an 'n_estimators' entry in params; the
            # number of trees is set by num_boost_round, so pass the gene
            # there, otherwise it would never influence the fitness.
            num_round = int(individual[1])
            model = xgboost.train(params, dMatrixTrain, num_boost_round=num_round)
            # Threshold the predicted probabilities at 0.5.
            preds = model.predict(dMatrixtest) > 0.5
            scores.append(self.fitness_function(y_test, preds))
        return scores

    def parents_selection(self, population, fitness, num_store):
        """Keep the individuals with the highest fitness.

        param population: population, shape=[self.num_parents, num_gene]
        param fitness: fitness values, one per individual (not modified)
        param num_store: number of individuals to keep
        return: the best individuals, shape=[num_store, num_gene]
        """
        # Work on a copy so the caller's fitness values are left untouched.
        remaining = np.array(fitness, dtype=float)
        selected = np.empty((num_store, population.shape[1]))
        for slot in range(num_store):
            # argmax returns the first index of the maximum.
            best = int(np.argmax(remaining))
            selected[slot, :] = population[best, :]
            # Mark as taken so the same individual is not picked again.
            remaining[best] = -np.inf
        return selected

    def crossover_uniform(self, parents, childrenSize):
        """Uniform crossover between consecutive parents.

        The gene positions are split once into two groups; child i takes the
        first group from parent i and the second group from parent i + 1
        (both indices wrap around the number of parents).

        param parents: selected parents, shape=[num_parents, num_gene]
        param childrenSize: (number of children, number of genes)
        return: children, shape=childrenSize
        """
        num_children = int(childrenSize[0])
        num_genes = int(childrenSize[1])

        # Positions copied from the first parent (duplicates may occur, so
        # fewer than num_genes // 2 distinct positions can be chosen).
        from_first = np.random.randint(0, num_genes, num_genes // 2)
        # Every remaining position is copied from the second parent.
        from_second = np.array(
            sorted(set(range(num_genes)) - set(from_first.tolist())),
            dtype=int,
        )

        children = np.empty((num_children, num_genes))
        for i in range(num_children):
            first = i % parents.shape[0]
            second = (i + 1) % parents.shape[0]
            children[i, from_first] = parents[first, from_first]
            children[i, from_second] = parents[second, from_second]
        return children

    def mutation(self, crossover, num_param):
        """Mutate one randomly chosen gene of every child.

        A single gene and a single random offset are drawn; the offset is
        added to that gene in every row, and results outside the allowed
        range are clamped to its minimum or maximum.

        param crossover: population to mutate (modified in place)
        param num_param: number of genes per individual
        return: the mutated population (same array as ``crossover``)
        """
        # Allowed minimum and maximum of every gene.
        min_max = np.zeros((num_param, 2))
        min_max[0, :] = [0.01, 1.0]   # learning_rate
        min_max[1, :] = [10, 2000]    # n_estimators
        min_max[2, :] = [1, 15]       # max_depth
        min_max[3, :] = [0, 10.0]     # min_child_weight
        min_max[4, :] = [0.01, 10.0]  # gamma
        min_max[5, :] = [0.01, 1.0]   # subsample
        min_max[6, :] = [0.01, 1.0]   # colsample_bytree

        gene = int(np.random.randint(0, 7))
        if gene == 1:  # n_estimators
            delta = int(np.random.randint(-200, 200))
        elif gene == 2:  # max_depth
            delta = int(np.random.randint(-5, 5))
        elif gene == 3:  # min_child_weight
            # Symmetric range: uniform(5, 5) would always return exactly 5.
            delta = round(np.random.uniform(-5, 5), 2)
        elif gene == 4:  # gamma
            delta = round(np.random.uniform(-2, 2), 2)
        else:  # learning_rate, subsample, colsample_bytree
            delta = round(np.random.uniform(-0.5, 0.5), 2)

        # Apply the offset to every child and clamp to the allowed range.
        lower, upper = min_max[gene]
        crossover[:, gene] = np.clip(crossover[:, gene] + delta, lower, upper)
        return crossover


# ===================== Parameter tuning test =====================
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data, split it, and standardise using statistics of the training
# split only.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
xgDMatrixTrain = xgboost.DMatrix(X_train, y_train)
xgbDMatrixTest = xgboost.DMatrix(X_test, y_test)

number_of_parents = 8          # size of the initial population
number_of_generations = 4      # number of generations, i.e. iterations
number_of_parameters = 7       # number of parameters being optimised
number_of_parents_mating = 4   # individuals kept in every generation

gx = GeneticXgboost(num_parents=number_of_parents)

# Size of the population.
populationSize = (number_of_parents, number_of_parameters)

# Initial population.
population = gx.initilialize_poplulation()

# Fitness history: one row per generation plus one for the final population.
FitnessHistory = np.empty([number_of_generations + 1, number_of_parents])

# Parameter values of every individual of every generation.
populationHistory = np.empty(
    [(number_of_generations + 1) * number_of_parents, number_of_parameters])

# Record the initial parameter values.
populationHistory[0:number_of_parents, :] = population

# Training.
for generation in range(number_of_generations):
    print("This is number %s generation" % (generation))

    # Train on the dataset and obtain the fitness of every individual.
    FitnessValue = gx.fitness_compute(population=population,
                                      dMatrixTrain=xgDMatrixTrain,
                                      dMatrixtest=xgbDMatrixTest,
                                      y_test=y_test)
    FitnessHistory[generation, :] = FitnessValue
    print('Best F1 score in the iteration = {}'.format(
        np.max(FitnessHistory[generation, :])))

    # Parents that survive into the next generation.
    parents = gx.parents_selection(population=population,
                                   fitness=FitnessValue,
                                   num_store=number_of_parents_mating)

    # Children produced by crossover.
    children = gx.crossover_uniform(
        parents=parents,
        childrenSize=(populationSize[0] - parents.shape[0],
                      number_of_parameters))

    # Add mutation to create genetic diversity.
    children_mutated = gx.mutation(children, number_of_parameters)

    # New population: the selected parents followed by the mutated children.
    population[0:parents.shape[0], :] = parents
    population[parents.shape[0]:, :] = children_mutated
    populationHistory[
        (generation + 1) * number_of_parents:
        (generation + 1) * number_of_parents + number_of_parents, :
    ] = population

# Best solution of the final generation.
fitness = gx.fitness_compute(population=population,
                             dMatrixTrain=xgDMatrixTrain,
                             dMatrixtest=xgbDMatrixTest,
                             y_test=y_test)
# Fill the last history row, which would otherwise keep the uninitialised
# contents left by np.empty.
FitnessHistory[number_of_generations, :] = fitness

# argmax returns the first index of the maximum fitness.
bestFitnessIndex = int(np.argmax(fitness))
print("Best fitness is =", fitness[bestFitnessIndex])
print("Best parameters are:")
print('learning_rate=', population[bestFitnessIndex][0])
print('n_estimators=', population[bestFitnessIndex][1])
print('max_depth=', int(population[bestFitnessIndex][2]))
print('min_child_weight=', population[bestFitnessIndex][3])
print('gamma=', population[bestFitnessIndex][4])
print('subsample=', population[bestFitnessIndex][5])
print('colsample_bytree=', population[bestFitnessIndex][6])