要求:计算各个特征的标准差,取标准差比较小的特征,
然后pca处理,取6-8个特征来预测结果
使用woa-dt模型
B站讲解视频:【鲸鱼优化算法优化DecisionTreeRegressor-Python】 https://www.bilibili.com/video/BV1trYie4EtE/?share_source=copy_web&vd_source=928f967ecbd95c9b874da5eb502e8c69
数据预处理
查看数据,并删除异常值
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeRegressor
# Widen the column display first so describe() shows every feature, then load
# the raw dataset and print summary statistics as a first sanity check.
pd.set_option('display.max_columns', 18)
data = pd.read_csv('F:/待做/项目/woa-df机器学习/python/data.csv')
print(data.describe())
# 基于Z-Score方法的异常值识别
def detect_outliers_zscore(data, threshold=4.5):
    """Return the sorted row indices whose |z-score| exceeds *threshold* in any column.

    Each column is standardized with its own mean and (sample) standard
    deviation; a row is flagged as soon as one of its cells crosses the
    threshold.
    """
    centered = data - data.mean()
    scores = (centered / data.std()).abs()
    flagged_rows = np.where(scores.values > threshold)[0]
    return list(np.unique(flagged_rows))
# Identify outlier rows with the Z-score method (default threshold 4.5).
# NOTE(review): the header above says outliers should be removed, but they are
# only printed here and never dropped from `data` — confirm intent.
outliers = detect_outliers_zscore(data)
print(outliers)  # row indices of the detected outliers
print(len(outliers))
计算各个特征的标准差,取标准差比较小的特征
# Per-feature standard deviations.
# Drop the target column (X17) and the bookkeeping 'index' column first.
features = data.drop(columns=['X17', 'index'])
std_devs = features.std()
print("各个特征的标准差:", std_devs)
# Rank the features from widest to narrowest spread.
std_devs_desc = std_devs.sort_values(ascending=False)
print(std_devs_desc)
pca处理
# PCA: compress the remaining features down to 8 principal components.
# X6/X7 are dropped here — presumably the high-variance features identified
# above (the stated plan keeps the low-variance ones); confirm with the owner.
target = data['X17']
features = features.drop(columns=['X6', 'X7'])
pca = PCA(n_components=8)
x_pca = pca.fit_transform(features)
woa-dt
# Fitness function: mean cross-validated MSE of a decision-tree regressor
# built with the given hyper-parameters (lower is better).
def evaluate(max_depth, min_samples_split, min_samples_leaf, x, y, kf):
    """Return the mean CV mean-squared-error for one hyper-parameter triple.

    The continuous values produced by the optimizer are truncated to ints;
    a falsy max_depth means "unlimited depth" for the tree.
    """
    depth = int(max_depth) if max_depth else None
    model = DecisionTreeRegressor(
        max_depth=depth,
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
    )
    neg_mse = cross_val_score(model, x, y, cv=kf, scoring='neg_mean_squared_error')
    return -np.mean(neg_mse)
# Population initialization for the whale optimizer: one row per candidate,
# each coordinate drawn uniformly from its hyper-parameter range.
def initialization_WOA(popsize, dim_limits):
    """Return a (popsize, len(dim_limits)) array of random candidates."""
    return np.array([
        [np.random.uniform(low=lo, high=hi) for (lo, hi) in dim_limits]
        for _ in range(popsize)
    ])
# WOA main loop (Mirjalili & Lewis, 2016), used here to tune the decision tree.
def whale_optimization_algorithm(x, y, pop_size, max_iter, dim_limits, kf):
    """Minimize `evaluate` over the hyper-parameter box with the Whale
    Optimization Algorithm.

    Parameters
    ----------
    x, y : training features / target, passed through to `evaluate`.
    pop_size : number of whales (candidate solutions).
    max_iter : number of optimization iterations.
    dim_limits : list of (low, high) bounds, one per hyper-parameter.
    kf : cross-validation splitter, passed through to `evaluate`.

    Returns
    -------
    (best_fitness, best_position) : lowest mean CV-MSE found and the
    corresponding parameter vector.
    """
    whale_pop = initialization_WOA(pop_size, dim_limits)
    fitness = np.array([evaluate(w[0], w[1], w[2], x, y, kf) for w in whale_pop])

    best_index = np.argmin(fitness)
    best_fitness = fitness[best_index]
    # .copy() is essential: whale_pop rows are mutated in place below, so a
    # bare reference to whale_pop[i] would silently corrupt the best solution.
    best_position = whale_pop[best_index].copy()

    for iteration in range(max_iter):
        a = 2 - iteration * (2 / max_iter)  # linearly decreases from 2 to 0
        leader = whale_pop[np.argmin(fitness)].copy()

        for i in range(pop_size):
            old_position = whale_pop[i].copy()  # kept for greedy revert below
            r1 = np.random.random()
            r2 = np.random.random()
            A = 2 * a * r1 - a
            C = 2 * r2
            p = np.random.random()
            for j in range(whale_pop.shape[1]):
                if p < 0.5:
                    if np.abs(A) > 1:
                        # Exploration: move toward a random whale, per dimension
                        # (the original assigned the whole vector here, mixing
                        # vector and scalar updates within the j-loop).
                        rand_index = np.random.randint(0, pop_size)
                        D_rand = np.abs(C * whale_pop[rand_index][j] - whale_pop[i][j])
                        whale_pop[i][j] = whale_pop[rand_index][j] - A * D_rand
                    else:
                        # Exploitation: encircle the current leader.
                        D_leader = np.abs(C * leader[j] - whale_pop[i][j])
                        whale_pop[i][j] = leader[j] - A * D_leader
                else:
                    # Spiral update with l ~ U(-1, 1) and b = 1, per the
                    # canonical WOA formulation. The original used
                    # exp(a*iteration)*cos(2*pi*iteration): cos(2*pi*k) is
                    # always 1 for integer k and the exp term explodes.
                    l = np.random.uniform(-1, 1)
                    distance_to_leader = np.abs(leader[j] - whale_pop[i][j])
                    whale_pop[i][j] = (
                        distance_to_leader * np.exp(l) * np.cos(2 * np.pi * l) + leader[j]
                    )
                # Keep every coordinate inside the search box.
                whale_pop[i][j] = np.clip(whale_pop[i][j], dim_limits[j][0], dim_limits[j][1])

            new_fitness = evaluate(whale_pop[i][0], whale_pop[i][1], whale_pop[i][2], x, y, kf)
            if new_fitness < fitness[i]:
                fitness[i] = new_fitness
                if new_fitness < best_fitness:
                    best_fitness = new_fitness
                    best_position = whale_pop[i].copy()
            else:
                # Greedy selection: revert to the previous (better) position so
                # the fitness array always describes the actual population.
                whale_pop[i] = old_position
    return best_fitness, best_position
# Decision-tree search space: (max_depth, min_samples_split, min_samples_leaf).
dim_limits = [(1, 200), (2, 40), (1, 40)]
# Run the whale optimizer to find the best decision-tree hyper-parameters.
pop_size, max_iter = 40, 20
kf = KFold(n_splits=10, shuffle=True, random_state=1)  # 10-fold CV for the fitness function
# Fit on the PCA-reduced matrix, not the raw features: the stated plan is to
# predict from the 8 principal components, and x_pca was otherwise unused.
best_fitness, best_params = whale_optimization_algorithm(x_pca, target, pop_size, max_iter, dim_limits, kf)
参考资料:【【智能优化算法】鲸鱼优化算法基础与实战,附原理讲义及实战代码,数学建模国赛美赛必看】 https://www.bilibili.com/video/BV1gU411S7TL/?share_source=copy_web&vd_source=928f967ecbd95c9b874da5eb502e8c69