基于遗传算法特征选择以及SVM算法实现男女生分类器

最新推荐文章于 2024-07-09 10:54:05 发布

乐宝不是酒

最新推荐文章于 2024-07-09 10:54:05 发布

阅读量1.7k

点赞数

分类专栏：算法　文章标签：机器学习

本文链接：https://blog.csdn.net/qq_20496483/article/details/109841792

版权

算法专栏收录该内容

2 篇文章 1 订阅

订阅专栏

数据集

完整数据集请到Github仓库下载。

# 数据存放在 people.csv 文件中
# 性别：0代表女，1代表男，是否喜欢运动：0代表不喜欢，1代表喜欢
# 性别，身高，体重，鞋码，50米成绩，肺活量，是否喜欢运动
0,164,47,38,9,2500,1
0,160,46,38,9,2500,1
0,165,60,39,7.4,2400,1
0,168,44,38,9,4000,0
0,167,49,38,6.9,3800,1
0,175,50,38,9,3800,0
0,172,43,36,7.9,2400,1
0,158,49,35,7.9,2400,1
0,162,46,36,9.5,2800,0
0,158,50,37,11,2500,0
0,165,48,36,8,2500,1
0,172,57,38,9.25,2800,0
...

遗传算法特征选择，SVM分类

# -*- coding: utf-8 -*-
"""
Created on Mon Nov 16 17:51:47 2020
@author: Jiaqingwang
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc ,confusion_matrix #计算roc和auc
# Min-max normalisation of a data set
def maxminnorm(array):
    """Rescale every column of ``array`` linearly so its values span [0, 1].

    Parameters
    ----------
    array : array-like, 2-D
        Samples in rows, features in columns.

    Returns
    -------
    numpy.matrix
        Float matrix with the same shape as ``array``. A column whose
        values are all equal is mapped to zeros.
    """
    array = np.asarray(array, dtype=float)
    mincols = array.min(axis=0)
    span = array.max(axis=0) - mincols
    # A constant column has zero span; dividing by it would turn the whole
    # column into NaN, so divide by 1 instead (the numerator is already 0).
    span = np.where(span == 0, 1.0, span)
    # np.mat was removed in NumPy 2.0; np.asmatrix builds the same type.
    return np.asmatrix((array - mincols) / span)

# Load the raw table. Column 0 is the gender label, columns 1-6 hold the
# six features (height, weight, shoe size, 50 m time, lung capacity, sport).
data = pd.read_csv('dataset/people.csv',header=None,sep=',')
data1 = data.iloc[0:208,1:7]        # first 208 rows, feature columns only
data1 = maxminnorm(data1)           # scale every feature to [0, 1]
# np.mat was removed in NumPy 2.0; np.asmatrix is the equivalent call.
data2 = np.asmatrix(data1)
# NOTE(review): data2 has only six columns, so the slice 1:7 below keeps
# columns 1-5 and drops feature 0; row 42 is in neither X1 nor X2. Nothing
# in the visible code reads X1, X2, m1, m2 or m -- confirm before relying
# on them.
X1 = data2[0:42,1:7]
X2 = data2[43:208,1:7]
m1 = np.mean(X1,axis = 0)           # mean vector of the first class
m2 = np.mean(X2,axis = 0)           # mean vector of the second class
m = np.mean(data2,axis = 0)         # overall mean vector

pc = 0.02      # probability that an individual is mutated
t = 100        # number of generations the genetic algorithm runs
individuals_Number = 30  # population size; keep it above 20 for enough randomness


# Genetic algorithm
def GA(d):
    """Search for the best subset of ``d`` features out of 6.

    Each chromosome is a length-6 vector of 0/1 flags with exactly ``d``
    ones. The population size and generation count come from the
    module-level ``individuals_Number`` and ``t``.

    Parameters
    ----------
    d : int
        Number of features every chromosome selects.

    Returns
    -------
    tuple
        ``(best_people, best_fitness, fitness_change, population)``:
        the fittest chromosome evaluated in any generation, its fitness,
        the per-generation maximum fitness (length ``t``) and the final
        population. ``best_people`` is ``None`` and ``best_fitness`` is
        ``-inf`` when no generation was evaluated.
    """
    # Initial population: random permutations of (6 - d) zeros and d ones.
    template = np.append(np.zeros(6-d), np.ones(d))
    population = np.zeros((individuals_Number,6))
    for i in range(individuals_Number):
        population[i] = np.random.permutation(template)

    fitness_change = np.zeros(t)
    best_people = None
    best_fitness = -np.inf
    for i in range(t):
        # Fitness of every individual of the current generation.
        fitness = np.zeros(individuals_Number)
        for j in range(individuals_Number):
            fitness[j] = Jd(population[j])

        # Record the champion now, while `fitness` still lines up with
        # `population`. After selection/crossover/mutation the rows have
        # changed, so indexing the new population with this argmax would
        # return an unrelated chromosome.
        champion = fitness.argmax()
        fitness_change[i] = fitness[champion]
        if fitness[champion] > best_fitness:
            best_fitness = fitness[champion]
            best_people = population[champion].copy()

        population = selection(population,fitness)  # roulette-wheel selection
        population = crossover(population)          # recombine pairs
        population = mutation(population)           # random bit swaps

    return best_people,best_fitness,fitness_change,population
    

# Roulette-wheel selection
def selection(population,fitness):
    """Draw a new population with probability proportional to fitness.

    Parameters
    ----------
    population : array-like, shape (n, genes)
        Current chromosomes, one per row.
    fitness : array-like, shape (n,)
        Non-negative fitness of each row of ``population``.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``population``; every row is a
        copy of one of the input rows.

    Raises
    ------
    ValueError
        If ``fitness`` does not have one entry per individual.
    """
    population = np.asarray(population, dtype=float)
    fitness = np.asarray(fitness, dtype=float)
    count = len(population)
    if len(fitness) != count:
        raise ValueError("fitness must have one entry per individual")
    if count == 0:
        return population.copy()

    total = fitness.sum()
    if not np.isfinite(total) or total <= 0:
        # No usable fitness information: fall back to uniform sampling
        # instead of dividing by zero.
        picks = np.random.randint(0, count, size=count)
    else:
        wheel = np.cumsum(fitness) / total
        spins = np.random.uniform(0, 1, size=count)
        # Individual k owns the interval [wheel[k-1], wheel[k]).
        picks = np.searchsorted(wheel, spins, side='right')
        # Rounding can leave wheel[-1] slightly below 1.0; clamp so that a
        # spin above it still selects the last individual rather than
        # producing an all-zero row.
        picks = np.minimum(picks, count - 1)
    return population[picks]
                

# Crossover
def crossover(population):
    """Recombine pairs of chromosomes while keeping their number of ones.

    The first 10 rows act as fathers and the remaining rows as mothers.
    Both groups are shuffled, then father ``i`` is paired with mother
    ``i``. Within a pair, half of the positions where only the father has
    a 1 are exchanged with the same number of positions where only the
    mother has a 1. Mothers without a partner are returned unchanged.

    Parameters
    ----------
    population : array-like, shape (n, genes)
        Chromosomes of 0/1 flags, one per row. The input is not modified.

    Returns
    -------
    numpy.ndarray
        New array of the same shape: shuffled fathers followed by
        shuffled mothers, after crossover.
    """
    # Work on a copy so the caller's array is not shuffled in place.
    population = np.array(population)
    genes = population.shape[1]
    father = population[0:10,:]      # views into the copy
    mother = population[10:,:]
    np.random.shuffle(father)        # shuffle rows to pair parents randomly
    np.random.shuffle(mother)

    for i in range(min(len(father), len(mother))):
        dad = father[i]
        mum = mother[i]
        one_zero = [j for j in range(genes) if dad[j]==1 and mum[j]==0]
        zero_one = [j for j in range(genes) if dad[j]==0 and mum[j]==1]
        # Use the shorter list: the two differ in length when the parents
        # carry different numbers of ones, and max() would index past the
        # end of the shorter one.
        swaps = min(len(one_zero), len(zero_one)) // 2
        for p, q in zip(one_zero[:swaps], zero_one[:swaps]):
            dad[p] = 0
            mum[p] = 1
            dad[q] = 1
            mum[q] = 0

    # father and mother are views covering every row of the copy.
    return population
                
            
    
# Mutation
def mutation(population):
    """Mutate individuals in place with probability ``pc`` each.

    A mutation turns one randomly chosen 0 into a 1 and one randomly
    chosen non-zero gene into a 0, so the number of selected features is
    preserved. Rows that are all zeros or contain no zeros cannot be
    swapped and are left untouched.

    Parameters
    ----------
    population : numpy.ndarray, shape (n, genes)
        Chromosomes of 0/1 flags; modified in place.

    Returns
    -------
    numpy.ndarray
        The same ``population`` object.
    """
    for i in range(len(population)):
        if np.random.uniform(0,1) > pc:
            continue
        mutation_s = population[i]
        zero = np.flatnonzero(mutation_s == 0)   # positions holding 0
        one = np.flatnonzero(mutation_s != 0)    # positions holding 1
        if len(zero) == 0 or len(one) == 0:
            # np.random.randint(0, 0) raises ValueError; there is nothing
            # to swap in such a chromosome anyway.
            continue
        e = zero[np.random.randint(0,len(zero))]   # gene flipped 0 -> 1
        f = one[np.random.randint(0,len(one))]     # gene flipped 1 -> 0
        mutation_s[e] = 1
        mutation_s[f] = 0
        population[i] = mutation_s

    return population


# Fitness of one individual
def Jd(x):
    """Return the class-separability fitness of chromosome ``x``.

    The fitness is ``trace(Sb) / trace(Sw)``, where ``Sb`` is the
    between-class and ``Sw`` the within-class scatter matrix of the
    selected columns of the module-level ``data2``.

    Parameters
    ----------
    x : array-like, length 6
        0/1 flags; a 1 at position ``i`` selects column ``i`` of ``data2``.

    Returns
    -------
    float
        The fitness value; ``0.0`` when ``x`` selects no feature.
    """
    # Take the number of selected features from the chromosome itself
    # rather than from a module-level `d`, which only exists while the
    # __main__ loop is running.
    chosen = np.flatnonzero(np.asarray(x) == 1)
    d = len(chosen)
    if d == 0:
        return 0.0

    # Keep only the selected feature columns.
    data3 = np.asarray(data2)[:, chosen]

    # NOTE(review): row 42 is in neither class slice (0:42 and 43:208),
    # although it does contribute to the overall mean below. The split is
    # kept as in the original -- confirm the real class boundary against
    # the label column.
    data3_1 = data3[0:42,:]
    data3_2 = data3[43:208,:]

    m = data3.mean(axis=0)        # overall mean vector
    m1 = data3_1.mean(axis=0)     # mean vector of the first class
    m2 = data3_2.mean(axis=0)     # mean vector of the second class

    # Between-class scatter, averaged over the two classes.
    Sb = (np.outer(m1 - m, m1 - m) + np.outer(m2 - m, m2 - m))/2

    # Within-class scatter: per-class average of the outer products of the
    # centred samples, then averaged over the two classes.
    c1 = data3_1 - m1
    c2 = data3_2 - m2
    S1 = c1.T.dot(c1)/len(data3_1)
    S2 = c2.T.dot(c2)/len(data3_2)
    Sw = (S1 + S2)/2

    return np.trace(Sb)/np.trace(Sw)

def svmfun(choice,d):
    """Evaluate the selected features with a linear SVM in 4-fold CV.

    For every fold the function prints AUC, accuracy, sensitivity (SE)
    and specificity (SP) and shows the ROC curve in a new figure.

    Parameters
    ----------
    choice : array-like of int
        0-based indices of the selected features, in the numbering used
        by the genetic algorithm (index ``j`` is column ``j + 1`` of
        ``dataset/people.csv``, because column 0 is the label).
    d : int
        Number of selected features; only used in the printed header.
    """
    # Shift by one: feature index 0 is CSV column 1. Without the shift,
    # index 0 would feed the label column to the classifier as a feature
    # and the sixth feature could never be selected.
    columns = np.asarray(choice).astype("int") + 1
    svm_data = pd.read_csv('dataset/people.csv',header=None,sep=',')
    svm_data = np.array(svm_data)
    np.random.shuffle(svm_data)
    labels = svm_data[...,0]
    # maxminnorm returns np.matrix, which recent scikit-learn rejects;
    # hand the estimator a plain ndarray.
    dataset = np.asarray(maxminnorm(svm_data[...,columns]))

    # 4-fold cross-validation
    KF = KFold(n_splits=4)
    for i, (train_index, test_index) in enumerate(KF.split(dataset), start=1):
        print("="*25,"维数:%d"%(d),",","第 %d 次交叉验证:"%(i),"="*25)
        x_train, x_test = dataset[train_index], dataset[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        # Settings tried per kernel:
        #   linear:  C=10, gamma=0.1
        #   rbf:     C=10, gamma=0.1
        #   sigmoid: C=10, gamma=0.1
        #   poly:    C=10, gamma=10
        # All other SVC arguments are left at their defaults.
        clf = SVC(C=10, kernel='linear', gamma=0.1)
        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)
        # Continuous scores give a real ROC curve; hard 0/1 predictions
        # would collapse it to a single operating point.
        y_score = clf.decision_function(x_test)

        print('AUC')
        print(roc_auc_score(y_test,y_score))
        print('ACC')
        print(accuracy_score(y_test, y_predict))
        fpr,tpr,threshold = roc_curve(y_test,y_score)   # false/true positive rates
        roc_auc = auc(fpr,tpr)                          # area under the ROC curve

        # Confusion matrix: rows are true classes, columns are predicted
        # classes, ordered by `labels`, so ravel() yields TN, FP, FN, TP
        # with class 1 as the positive class.
        matrix = confusion_matrix(y_test,y_predict,labels=[0, 1])
        TN, FP, FN, TP = matrix.ravel()
        SE = TP/(TP+FN)
        SP = TN/(TN+FP)
        print("SE:",SE)
        print("SP:",SP)

        lw = 2
        plt.figure(figsize=(7,7))
        # False positive rate on the x axis, true positive rate on the y axis.
        plt.plot(fpr, tpr, color='darkorange',
            lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Linear-SVM')
        plt.legend(loc="lower right")
        plt.show()

if __name__ == '__main__':

    # For each subset size d, run the genetic search and then evaluate the
    # winning feature subset with the SVM. GA(d) returns the best
    # chromosome, its fitness, the per-generation fitness trace and the
    # final population.
    for d in range(1,6):
        best_people, best_fitness, fitness_change, best_population = GA(d)

        print("*"*30,"维数 %d 时最优特征："%(d),"*"*30)
        print("在取%d维的时候，通过遗传算法得出的最优适应度值为：%.6f"%(d,best_fitness))
        print("选出的最优染色体为：")
        print(best_people)

        # Turn the 0/1 chromosome into the list of selected positions.
        choice = np.zeros(d)
        slot = 0
        for gene in range(6):
            if best_people[gene] != 1:
                continue
            choice[slot] = gene
            slot += 1

        print("选出的最优特征为：")
        print(choice)
        svmfun(choice, d)