数据集
完整数据集请到Github仓库下载。
# 数据存放在 people.csv 文件中
# 性别:0代表女,1代表男,是否喜欢运动:0代表不喜欢,1代表喜欢
# 性别,身高,体重,鞋码,50米成绩,肺活量,是否喜欢运动
0,164,47,38,9,2500,1
0,160,46,38,9,2500,1
0,165,60,39,7.4,2400,1
0,168,44,38,9,4000,0
0,167,49,38,6.9,3800,1
0,175,50,38,9,3800,0
0,172,43,36,7.9,2400,1
0,158,49,35,7.9,2400,1
0,162,46,36,9.5,2800,0
0,158,50,37,11,2500,0
0,165,48,36,8,2500,1
0,172,57,38,9.25,2800,0
...
遗传算法特征选择,SVM分类
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 16 17:51:47 2020
@author: Jiaqingwang
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc ,confusion_matrix #计算roc和auc
# Min-max normalisation of the data set
def maxminnorm(array):
    """Scale every column of *array* linearly onto the range [0, 1].

    Args:
        array: 2-D array-like of numbers (list of rows, ndarray,
            DataFrame, ...). Rows are samples, columns are features.

    Returns:
        A ``numpy.matrix`` of floats with the same shape as the input in
        which every value has been mapped with
        ``(value - column_min) / (column_max - column_min)``.
        A constant column (max == min) is mapped to all zeros instead of
        producing NaN/inf from a division by zero.
    """
    values = np.asarray(array, dtype=float)
    mincols = values.min(axis=0)
    span = values.max(axis=0) - mincols
    # A constant column has zero span; dividing by 1 keeps it at exactly 0
    # (its numerator is already 0) and avoids the 0/0 warning and NaN result.
    safe_span = np.where(span == 0, 1.0, span)
    # np.asmatrix is used rather than the np.mat alias, which NumPy 2.0 removed.
    return np.asmatrix((values - mincols) / safe_span)
# Load the raw CSV (no header row); column 0 is used as the class label
# elsewhere in this file, columns 1-6 are the six candidate features.
data = pd.read_csv('dataset/people.csv', header=None, sep=',')
# Keep the first 208 rows and the six feature columns, then min-max normalise.
data1 = data.iloc[0:208, 1:7]
data1 = maxminnorm(data1)
# np.asmatrix replaces the np.mat alias, which was removed in NumPy 2.0.
data2 = np.asmatrix(data1)
# NOTE(review): data2 only has the six columns selected above, so the slice
# 1:7 keeps columns 1-5 and drops the first feature; row 42 is in neither
# slice. Jd assigns its own local m/m1/m2, so these module-level values do not
# appear to be read anywhere in the visible code -- confirm before relying on
# them.
X1 = data2[0:42, 1:7]
X2 = data2[43:208, 1:7]
m1 = np.mean(X1, axis=0)  # mean vector of the first class
m2 = np.mean(X2, axis=0)  # mean vector of the second class
m = np.mean(data2, axis=0)  # mean vector of the whole data set
pc = 0.02  # mutation probability
t = 100  # number of generations the genetic algorithm runs
# Population size; the original author asks for more than 20 individuals so
# that the population stays sufficiently random.
individuals_Number = 30
# Genetic algorithm
def GA(d):
    """Search for a good subset of exactly *d* out of 6 features.

    Every individual is a length-6 vector of 0/1 genes with exactly *d* ones
    (1 = feature selected). For ``t`` generations the population is scored
    with ``Jd`` and then passed through ``selection``, ``crossover`` and
    ``mutation``, in that order.

    Args:
        d: number of features each individual selects.

    Returns:
        A tuple ``(best_people, best_fitness, fitness_change, population)``:
        ``best_people`` is the fittest individual of the last scored
        generation, ``best_fitness`` is its fitness, ``fitness_change`` holds
        the best fitness of every generation, and ``population`` is the
        population produced after the final selection/crossover/mutation step.
        When ``t`` is 0 nothing is scored and the first two items are ``None``.
    """
    n_features = 6
    # One template with d ones; every individual is a random permutation of it.
    template = np.append(np.zeros(n_features - d), np.ones(d))
    population = np.zeros((individuals_Number, n_features))
    for i in range(individuals_Number):
        population[i] = np.random.permutation(template)

    fitness_change = np.zeros(t)
    best_people = None
    best_fitness = None
    for generation in range(t):
        fitness = np.zeros(individuals_Number)
        for j in range(individuals_Number):
            fitness[j] = Jd(population[j])
        # Record the winner BEFORE the population is replaced below: once
        # selection/crossover/mutation have run, row k of the population no
        # longer corresponds to fitness[k]. The copy protects the record from
        # later in-place changes to the population array.
        best_index = fitness.argmax()
        best_fitness = fitness[best_index]
        best_people = population[best_index].copy()
        fitness_change[generation] = best_fitness
        population = selection(population, fitness)  # roulette-wheel sampling
        population = crossover(population)  # recombine pairs
        population = mutation(population)  # occasional random gene swap
    return best_people, best_fitness, fitness_change, population
# Roulette-wheel selection
def selection(population, fitness):
    """Draw a new population by fitness-proportional (roulette-wheel) sampling.

    Args:
        population: 2-D array with one individual per row.
        fitness: 1-D sequence with one non-negative fitness value per row of
            *population*.

    Returns:
        A new float array with as many rows as there are fitness values; each
        row is a copy of a row of *population*, drawn with replacement with
        probability proportional to its fitness. If the total fitness is not a
        positive finite number, every individual is given the same chance.
    """
    population = np.asarray(population)
    fitness = np.asarray(fitness, dtype=float)
    n = fitness.shape[0]
    if n == 0:
        return np.array(population[:0], dtype=float)

    cumulative = np.cumsum(fitness)
    total = cumulative[-1]
    if np.isfinite(total) and total > 0:
        # Dividing by the last cumulative value makes the final wheel edge
        # exactly 1.0, so no draw can fall past the end of the wheel.
        cumulative = cumulative / total
    else:
        # Degenerate wheel (all-zero or NaN fitness): fall back to uniform.
        cumulative = np.arange(1, n + 1) / n

    draws = np.random.uniform(0, 1, size=n)
    # side='left' returns the first j with draws <= cumulative[j], i.e. the
    # slot with cumulative[j-1] < draw <= cumulative[j].
    picks = np.searchsorted(cumulative, draws, side='left')
    picks = np.minimum(picks, n - 1)  # guard against any rounding overshoot
    return np.array(population[picks], dtype=float)
# Crossover
def crossover(population):
    """Recombine randomly paired individuals while keeping their gene counts.

    The first 10 rows act as "fathers" and the remaining rows as "mothers".
    Both groups are shuffled and father i is paired with mother i. For each
    pair, the positions where the father has 1 and the mother 0 (and vice
    versa) are collected, and half of them (rounded down) are swapped, so an
    individual's number of ones does not change.

    Args:
        population: 2-D float array of 0/1 genes, one individual per row.
            It is shuffled and modified in place, as slices of it are used
            directly.

    Returns:
        A new array holding the fathers followed by the mothers.
    """
    father = population[0:10, :]
    mother = population[10:, :]
    np.random.shuffle(father)  # shuffle rows so the pairing is random
    np.random.shuffle(mother)
    n_genes = population.shape[1]
    for i in range(min(len(father), len(mother))):
        father_1 = father[i]
        mother_1 = mother[i]
        one_zero = [j for j in range(n_genes)
                    if father_1[j] == 1 and mother_1[j] == 0]
        zero_one = [j for j in range(n_genes)
                    if father_1[j] == 0 and mother_1[j] == 1]
        # Use the shorter list: if the parents hold different numbers of ones
        # the two lists differ in length, and indexing by the longer one
        # would raise IndexError.
        half_length = min(len(one_zero), len(zero_one)) // 2
        for p, q in zip(one_zero[:half_length], zero_one[:half_length]):
            father_1[p] = 0
            mother_1[p] = 1
            father_1[q] = 1
            mother_1[q] = 0
    return np.append(father, mother, axis=0)
# Mutation
def mutation(population, mutation_rate=None):
    """Randomly mutate individuals by swapping one 0 gene with one 1 gene.

    Each individual is mutated with probability *mutation_rate*. A mutation
    turns one randomly chosen 0 into 1 and one randomly chosen non-zero gene
    into 0, so the number of selected features stays the same.

    Args:
        population: 2-D float array of 0/1 genes, one individual per row;
            modified in place.
        mutation_rate: probability of mutating each individual. Defaults to
            the module-level ``pc`` when omitted.

    Returns:
        The same *population* array, after mutation.
    """
    rate = pc if mutation_rate is None else mutation_rate
    for i in range(len(population)):
        if np.random.uniform(0, 1) > rate:
            continue
        mutation_s = population[i]
        zero = np.flatnonzero(mutation_s == 0)  # positions currently 0
        one = np.flatnonzero(mutation_s != 0)  # positions currently set
        # An all-ones or all-zeros individual has nothing to swap;
        # np.random.randint(0, 0) would raise ValueError, so skip it.
        if zero.size == 0 or one.size == 0:
            continue
        e = zero[np.random.randint(0, zero.size)]  # gene flipped 0 -> 1
        f = one[np.random.randint(0, one.size)]  # gene flipped 1 -> 0
        mutation_s[e] = 1
        mutation_s[f] = 0
    return population
# Fitness of one individual: x is a length-6 0/1 vector (1 = feature selected)
def Jd(x):
    """Return the class-separability fitness trace(Sb) / trace(Sw) of *x*.

    The features flagged with 1 in *x* are taken from the module-level
    ``data2``. Rows 0-41 form the first class and rows 43-207 the second.
    ``Sb`` is the between-class scatter (class means around the overall mean,
    averaged over the two classes) and ``Sw`` the within-class scatter
    (average of the two per-class covariance-style matrices).

    Args:
        x: 1-D sequence of 0/1 genes, one per feature column of ``data2``.

    Returns:
        ``trace(Sb) / trace(Sw)`` for the selected columns, or ``0.0`` when
        *x* selects no feature at all.
    """
    # Derive the selected columns from x itself rather than from a global
    # feature count, so the result always matches the chromosome scored.
    selected = np.flatnonzero(np.asarray(x).ravel() == 1)
    if selected.size == 0:
        # Nothing selected: no separability to measure, give lowest fitness.
        return 0.0
    data3 = np.asarray(data2)[:, selected]

    # NOTE(review): row 42 belongs to neither class slice although it is part
    # of the overall mean -- looks like an off-by-one in the class boundary;
    # confirm against the data file before changing.
    data3_1 = data3[0:42, :]
    data3_2 = data3[43:208, :]
    m = data3.mean(axis=0)  # overall mean vector
    m1 = data3_1.mean(axis=0)  # mean vector of the first class
    m2 = data3_2.mean(axis=0)  # mean vector of the second class

    # Between-class scatter, divided by the number of classes.
    Sb = (np.outer(m1 - m, m1 - m) + np.outer(m2 - m, m2 - m)) / 2

    # Within-class scatter: centered.T.dot(centered) is the sum over samples
    # of the outer products (sample - mean)(sample - mean)^T.
    centered_1 = data3_1 - m1
    centered_2 = data3_2 - m2
    S1 = centered_1.T.dot(centered_1) / data3_1.shape[0]
    S2 = centered_2.T.dot(centered_2) / data3_2.shape[0]
    Sw = (S1 + S2) / 2

    return np.trace(Sb) / np.trace(Sw)
def svmfun(choice,d):
    """Evaluate a feature subset with a linear SVM under 4-fold cross-validation.

    The CSV is re-read and its rows shuffled; column 0 is the class label.
    For every fold the function prints AUC, accuracy, sensitivity (SE) and
    specificity (SP), and shows the ROC curve in a matplotlib window.

    Args:
        choice: 0-based positions of the selected features within the six
            feature columns (i.e. CSV columns 1-6), as produced from a
            chromosome.
        d: number of selected features; only used in the printed banner.

    Returns:
        None.
    """
    # Data preparation.
    # The chromosome positions index the feature block that starts at CSV
    # column 1. Without the +1 shift, position 0 would select column 0 -- the
    # label itself -- and leak the target into the features.
    feature_columns = np.array(choice).astype("int") + 1
    svm_data = pd.read_csv('dataset/people.csv',header=None,sep=',')
    svm_data = np.array(svm_data)
    np.random.shuffle(svm_data)
    labels = svm_data[...,0]
    # scikit-learn estimators reject numpy.matrix input, so convert the
    # normalised matrix to a plain ndarray.
    dataset = np.asarray(maxminnorm(svm_data[...,feature_columns]))

    # 4-fold cross-validation
    KF = KFold(n_splits=4)
    for i, (train_index, test_index) in enumerate(KF.split(dataset), start=1):
        print("="*25,"维数:%d"%(d),",","第 %d 次交叉验证:"%(i),"="*25)
        x_train, x_test = dataset[train_index], dataset[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        # Kernel settings tried by the original author:
        #   linear:  gamma=0.1, C=10
        #   rbf:     C=10, gamma=0.1
        #   sigmoid: C=10, gamma=0.1
        #   poly:    C=10, gamma=10
        clf = SVC(C=10, kernel='linear', degree=3, gamma=0.1,
                  coef0=0.0, shrinking=True, probability=False,
                  tol=1e-3, cache_size=200, class_weight=None,
                  verbose=False, max_iter=-1, decision_function_shape='ovr',
                  random_state=None)
        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)
        # ROC/AUC need a continuous score; hard 0/1 predictions collapse the
        # curve to a single operating point.
        y_score = clf.decision_function(x_test)
        print('AUC')
        print(roc_auc_score(y_test, y_score))
        print('ACC')
        print(accuracy_score(y_test, y_predict))
        fpr, tpr, _ = roc_curve(y_test, y_score)  # false/true positive rates
        roc_auc = auc(fpr, tpr)  # area under the ROC curve

        # Confusion matrix: rows are true labels, columns are predictions, so
        # with labels ordered [0, 1] ravel() yields tn, fp, fn, tp for the
        # positive class 1. Passing labels keeps the matrix 2x2 even if a
        # fold happens to contain a single class.
        TN, FP, FN, TP = confusion_matrix(y_test, y_predict, labels=[0, 1]).ravel()
        SE = TP/(TP+FN) if (TP+FN) else float('nan')  # sensitivity (recall)
        SP = TN/(TN+FP) if (TN+FP) else float('nan')  # specificity
        print("SE:",SE)
        print("SP:",SP)

        lw = 2
        plt.figure(figsize=(7,7))
        # False positive rate on the x axis, true positive rate on the y axis.
        plt.plot(fpr, tpr, color='darkorange',
                 lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Linear-SVM')
        plt.legend(loc="lower right")
        plt.show()
if __name__ == '__main__':
    # Script entry point: for every subset size from 1 to 5, run the genetic
    # algorithm, report the winning chromosome and evaluate it with the SVM.
    # fitness_change is GA's per-generation best fitness; best_fitness is the
    # value reported for the final result.
    # NOTE(review): d is a module-level name here; other functions in this
    # file may read it as a global, so the loop variable is not renamed.
    for d in range(1,6):
        best_people,best_fitness,fitness_change,best_population = GA(d)
        print("*"*30,"维数 %d 时最优特征:"%(d),"*"*30)
        print("在取%d维的时候,通过遗传算法得出的最优适应度值为:%.6f"%(d,best_fitness))
        print("选出的最优染色体为:")
        print(best_people)
        # Translate the 0/1 chromosome into the positions of its 1 genes.
        choice = np.zeros(d)
        slot = 0
        for position, gene in enumerate(best_people[:6]):
            if gene == 1:
                choice[slot] = position
                slot += 1
        print("选出的最优特征为:")
        print(choice)
        svmfun(choice, d)