基于遗传算法的特征选择器（以红酒数据集为例）

最新推荐文章于 2024-06-22 11:35:44 发布

腾风扶摇

最新推荐文章于 2024-06-22 11:35:44 发布

阅读量3.4k

点赞数

文章标签： python 机器学习 开发语言

本文链接：https://blog.csdn.net/weixin_43982319/article/details/123504730

版权

我们结合遗传算法与KNN算法，设计了基于GA-KNN的红酒特征选择器。首先将红酒数据集的不同特征组合数值化为个体的基因序列并初始化种群，再经过交叉、变异等操作进行种群的迭代；以KNN算法的预测正确率作为个体的适应值，同时结合网格搜索寻找最优的估计器参数。在迭代过程中对红酒数据集的特征组合进行寻优，排除冗余特征，找出KNN测试结果最好的一组（或几组）特征集合，实现特征降维。

代码分为三个文件编写，运行前需要提前安装 numpy、pandas、scikit-learn 和 matplotlib 库。

代码实现如下：

Feature_selection_genetic_algorithm.py

# -*- encoding: utf-8 -*-

import random
import math
import numpy as np
import pandas as pd
from Genetic_algorithm import GA
import matplotlib.pyplot as plt
#import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

class FeatureSelection(object):
    """Genetic-algorithm feature selector for the red-wine data set.

    An individual's gene is a 0/1 mask over the candidate feature columns
    (``self.columns[1:]``). Its fitness is the held-out accuracy of a
    grid-searched KNN classifier trained on the columns the mask selects.
    """

    def __init__(self, aLifeCount=10):
        """Load the data set and create the genetic-algorithm driver.

        Args:
            aLifeCount: Number of individuals in every generation.
        """
        # The first entry is the label column; the rest are candidate features.
        self.columns = ['target', 'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides',
                        'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol']
        self.wine = pd.read_csv("winequality_red.csv", low_memory=False, usecols=self.columns)

        self.lifeCount = aLifeCount
        self.ga = GA(aCrossRate=0.6,
                     aMutationRage=0.1,
                     aLifeCount=self.lifeCount,
                     aGeneLenght=len(self.columns) - 1,
                     aMatchFun=self.matchFun())

    def knn_score(self, order):
        """Return the held-out KNN accuracy obtained with one feature mask.

        Args:
            order: Sequence of 0/1 flags, one per entry of
                ``self.columns[1:]``; a 1 selects the corresponding column.

        Returns:
            Accuracy of the grid-searched estimator on the 40% test split,
            or 0.0 when the mask selects no feature at all.
        """
        label = self.columns[0]
        features_name = [name for name, flag in zip(self.columns[1:], order) if flag == 1]

        # An empty mask cannot train a classifier; give it the worst fitness
        # instead of failing inside the scaler/estimator.
        if not features_name:
            return 0.0

        # The label column must NOT be part of X: feeding it to the classifier
        # leaks the answer and makes every feature subset look near-perfect.
        self.Xtrain, self.Xtest, self.Ytrain, self.Ytest = train_test_split(
            self.wine[features_name], self.wine[label], test_size=0.4)

        # Standardise using statistics of the training split only.
        transfer = StandardScaler()
        self.Xtrain = transfer.fit_transform(self.Xtrain)
        self.Xtest = transfer.transform(self.Xtest)

        # Grid search over n_neighbors with 5-fold cross-validation.
        param_dict = {"n_neighbors": [1, 3, 5, 7, 9]}
        estimator = GridSearchCV(KNeighborsClassifier(), param_grid=param_dict, cv=5)
        estimator.fit(self.Xtrain, self.Ytrain)

        # Accuracy on the held-out split is the fitness value.
        score = estimator.score(self.Xtest, self.Ytest)
        print("score：\n", score)
        print("最佳参数：\n", estimator.best_params_)
        print("最佳结果：\n", estimator.best_score_)
        print("最佳估计器:\n", estimator.best_estimator_)
        return score

    def matchFun(self):
        """Return the fitness callback handed to the GA (life -> knn score)."""
        return lambda life: self.knn_score(life.gene)

    def run(self, n=0):
        """Evolve the population for ``n`` generations, then report and plot.

        Args:
            n: Number of generations to run.
        """
        distance_list = []
        generate = list(range(1, n + 1))
        for _ in range(n):
            self.ga.next()
            # Best fitness seen so far across all generations.
            distance = self.ga.score
            distance_list.append(distance)
            print(("第%d代 : 当前最好特征组合的线下验证结果为：%f") % (self.ga.generation, distance))

        print('当前最好特征组合:')
        features = self.columns[1:]
        string = [name for name, flag in zip(features, self.ga.gene) if flag == 1]
        print(string)
        print('最高knn_score：', self.ga.score)

        # Plot the best score recorded after each generation.
        plt.plot(generate, distance_list)
        plt.xlabel('generation')
        plt.ylabel('knn-score')
        plt.title('generation--knn-score')
        plt.grid()
        plt.show()


def main():
    """Run the feature-selection GA with five individuals for ten generations."""
    generations = 10  # number of GA iterations
    selector = FeatureSelection(aLifeCount=5)
    selector.run(generations)


if __name__ == '__main__':
    main()

Genetic_algorithm.py

# -*- coding: utf-8 -*-

import copy
import random
from Life import Life
import numpy as np

class GA(object):
    """Elitist genetic algorithm over fixed-length gene sequences."""

    def __init__(self, aCrossRate, aMutationRage, aLifeCount, aGeneLenght, aMatchFun=lambda life: 1):
        """Store the parameters and create the initial random population.

        Args:
            aCrossRate: Probability that a child is produced by crossover.
            aMutationRage: Probability that a child is mutated.
            aLifeCount: Number of individuals per generation.
            aGeneLenght: Length of every gene sequence.
            aMatchFun: Fitness callback taking a ``Life``; larger is better.
        """
        self.croessRate = aCrossRate  # crossover probability
        self.mutationRate = aMutationRage  # mutation probability
        self.lifeCount = aLifeCount  # population size
        self.geneLenght = aGeneLenght  # gene length
        self.matchFun = aMatchFun  # fitness function
        self.lives = []  # current population
        self.best = None  # best individual of the current generation
        self.gene = np.random.randint(0, 2, self.geneLenght)  # best gene seen so far (random placeholder)
        self.score = -1  # best fitness seen so far
        self.generation = 0  # generation counter
        self.crossCount = 0  # number of crossovers performed
        self.mutationCount = 0  # number of mutations performed
        self.bounds = 0.0  # sum of fitness values, used by roulette selection
        self.initPopulation()

    def initPopulation(self):
        """Fill the population with ``lifeCount`` random 0/1 genes."""
        self.lives = []
        for i in range(self.lifeCount):
            gene = np.random.randint(0, 2, self.geneLenght)
            random.shuffle(gene)
            life = Life(gene)
            self.lives.append(life)

    def judge(self):
        """Score every individual and update the generation/global best."""
        self.bounds = 0.0
        self.best = self.lives[0]
        for life in self.lives:
            life.score = self.matchFun(life)
            self.bounds += life.score
            if self.best.score < life.score:  # larger score is better
                self.best = life

        # Remember the best individual ever seen.
        if self.score < self.best.score:
            self.score = copy.deepcopy(self.best.score)
            self.gene = copy.deepcopy(self.best.gene)

        # Elitism: the generation's best is replaced by the global best so
        # that the best solution found so far is never lost.
        self.best.score = copy.deepcopy(self.score)
        self.best.gene = copy.deepcopy(self.gene)

    def cross(self, parent1, parent2):
        """Return a child gene built by segment crossover.

        The child is a copy of ``parent1.gene`` in which a random contiguous
        segment is replaced by the same segment of ``parent2.gene``. Neither
        parent is modified: both stay in the population with scores that were
        computed for their current genes.
        """
        index1 = random.randint(0, self.geneLenght - 1)  # segment start
        index2 = random.randint(index1, self.geneLenght - 1)  # segment end (inclusive)

        child = copy.copy(parent1.gene)
        child[index1:index2 + 1] = parent2.gene[index1:index2 + 1]

        self.crossCount += 1
        return child

    def mutation(self, gene):
        """Return a copy of ``gene`` with two random positions swapped."""
        index1 = random.randint(0, self.geneLenght - 1)
        index2 = random.randint(0, self.geneLenght - 1)
        # copy.copy() gives a real copy; gene[:] would only be a view for
        # NumPy arrays and the swap would write through to the source gene.
        newGene = copy.copy(gene)
        newGene[index1], newGene[index2] = newGene[index2], newGene[index1]
        self.mutationCount += 1
        return newGene

    def getOne(self):
        """Pick one individual by fitness-proportional (roulette) selection.

        Raises:
            Exception: If no individual is reached while walking the wheel.
        """
        r = random.uniform(0, self.bounds)
        for life in self.lives:
            r -= life.score
            if r <= 0:
                return life

        raise Exception("选择错误", self.bounds)

    def newChild(self):
        """Create one child via optional crossover and optional mutation."""
        parent1 = self.getOne()
        rate = random.random()

        if rate < self.croessRate:
            parent2 = self.getOne()
            gene = self.cross(parent1, parent2)
        else:
            # Copy so the child never shares its gene object with the parent.
            gene = copy.copy(parent1.gene)

        rate = random.random()
        if rate < self.mutationRate:
            gene = self.mutation(gene)

        return Life(gene)

    def next(self):
        """Evaluate the population and replace it with the next generation."""
        self.judge()
        newLives = []
        newLives.append(self.best)  # carry the best individual over unchanged

        while len(newLives) < self.lifeCount:
            newLives.append(self.newChild())
        self.lives = newLives
        self.generation += 1

Life.py

# -*- encoding: utf-8 -*-

SCORE_NONE = -1  # initial score assigned to every new individual


class Life(object):
    """A single individual: a gene sequence together with its score."""

    def __init__(self, aGene=None):
        """Keep the given gene and start with the initial score."""
        self.score = SCORE_NONE
        self.gene = aGene