数据文档
背景描述
随着年龄增长,脱发成为许多人关注的健康问题之一。头发的丰盈与否不仅影响着外貌,更与个体的健康状态息息相关。
本数据集汇集了各种可能导致脱发的因素,包括遗传因素、荷尔蒙变化、医疗状况、药物治疗、营养缺乏、心理压力等。
通过数据探索分析,可以深入挖掘这些因素与脱发之间的潜在关联,从而为个体健康管理、医疗干预以及相关产业的发展提供有益参考。
数据来源
Hair Health Prediction | Kaggle
相关代码:
import multiprocessing
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.decomposition import PCA
from multiprocessing import Pool
from tensorflow.keras import backend as K
# 定义神经网络模型
def create_cnn_model(input_shape, learning_rate):
    """Build and compile a small 1-D CNN for binary classification.

    Args:
        input_shape: shape of one sample, e.g. (n_features, 1).
        learning_rate: learning rate for the Adam optimizer.

    Returns:
        A compiled tf.keras.Sequential model ending in a sigmoid unit.
    """
    layers = tf.keras.layers
    net = tf.keras.Sequential()
    net.add(layers.Conv1D(32, kernel_size=3, activation='relu',
                          input_shape=input_shape))
    net.add(layers.MaxPooling1D(2))
    net.add(layers.Conv1D(64, kernel_size=3, activation='relu'))
    net.add(layers.MaxPooling1D(2))
    net.add(layers.Flatten())
    net.add(layers.Dense(64, activation='relu'))
    net.add(layers.Dense(1, activation='sigmoid'))
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return net
# 全局定义训练函数
def train_model(model):
    """Fit `model` on the training split and score it on the validation split.

    Relies on the module-level globals X_train / y_train / X_val / y_val,
    which are created in the script body further down the file.

    Returns:
        (validation_accuracy, trained_model)
    """
    model.fit(X_train, y_train, epochs=20, verbose=0)
    val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
    return val_accuracy, model
# 定义PBT算法
def run_pbt(population_size, generations):
    """Population Based Training (PBT) over the CNN's learning rate.

    Trains `population_size` models in parallel each generation. After a
    generation, the bottom 30% of models (by validation accuracy) copy
    the weights of a random top-30% model and adopt one of the best
    learning rates recorded so far (exploit), while top models get their
    learning rate randomly perturbed (explore).

    Args:
        population_size: number of models in the population.
        generations: number of PBT rounds to run.

    Returns:
        The model with the highest validation accuracy in the final
        generation, or None if `generations` is 0.
    """
    # Fixed seeds for reproducibility.
    np.random.seed(42)
    tf.random.set_seed(42)

    # BUG FIX: the original drew uniform(0.0001, 0.0001), which gives every
    # model the identical learning rate and defeats the point of PBT.
    # Sample a real range instead.
    learning_rates = np.random.uniform(0.0001, 0.01, size=population_size)
    population = [create_cnn_model(input_shape, learning_rates[i])
                  for i in range(population_size)]
    best_model = None
    num_processes = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(num_processes)
    complex_prarms = []  # best {idx, accuracy, lr} records seen across generations
    try:
        for gen in range(generations):
            print(f"Generation {gen + 1}")
            accuracies = []
            results = []
            complex_prarm = []
            for model in population:
                # BUG FIX: apply_async args must be a tuple — the original
                # passed (model), which is just `model`, not a 1-tuple.
                results.append(pool.apply_async(train_model, (model,)))
            # BUG FIX: the original only wait()ed and discarded the results,
            # so training done in the worker processes never reached the
            # parent's population. Copy the trained weights back.
            for i, result in enumerate(results):
                _, trained = result.get()
                population[i].set_weights(trained.get_weights())
            for i in range(population_size):
                loss, accuracy = population[i].evaluate(X_val, y_val, verbose=0)
                print(f"Model {i + 1}: loss={loss}, accuracy={accuracy}")
                accuracies.append(accuracy)
            # Rank models best-first by validation accuracy.
            sorted_indices = np.argsort(accuracies)[::-1]
            # Exploit pools: the top and bottom 30% of the population.
            exploit_indices_front = sorted_indices[:int(population_size * 0.3)]
            exploit_indices_back = sorted_indices[-int(population_size * 0.3):]
            best_model = population[sorted_indices[0]]
            # Explore: perturb the learning rates of the top performers.
            for idx in exploit_indices_front:
                new_lr = learning_rates[idx] * np.random.uniform(0.8, 1.2)
                complex_prarm.append({"idx": idx,
                                      "accuracy": accuracies[idx],
                                      "lr": learning_rates[idx]})
                # BUG FIX: the perturbed rate was computed (inter_lr) but
                # never applied; store it so the next generation uses it.
                learning_rates[idx] = new_lr
                population[idx].optimizer.learning_rate = new_lr
            print(complex_prarm)
            # BUG FIX: complex_prarms starts as [] and is never None, so the
            # original `is None` branch was dead code. Test for emptiness.
            if not complex_prarms:
                complex_prarms = complex_prarm.copy()
            else:
                # Merge this generation's records with the running best and
                # keep only the top len(complex_prarm) by accuracy.
                merged = complex_prarm + complex_prarms
                merged.sort(key=lambda x: x['accuracy'], reverse=True)
                complex_prarms = [
                    {'idx': item['idx'],
                     'accuracy': item['accuracy'],
                     'lr': item['lr']}
                    for item in merged[:len(complex_prarm)]
                ]
            print(complex_prarms)
            # Pool of historically-best learning rates for the exploit step.
            lr_values = [item['lr'] for item in complex_prarms]
            # Replace the bottom performers: clone a random top model's
            # weights and adopt one of the best learning rates seen so far.
            for idx in exploit_indices_back:
                source_idx = np.random.choice(exploit_indices_front)
                population[idx].set_weights(population[source_idx].get_weights())
                population[idx].optimizer.learning_rate = random.choice(lr_values)
    finally:
        # Always release the worker processes, even on error.
        pool.close()
        pool.join()
    return best_model
# ---- Script entry point: load data, preprocess, run PBT ----
# BUG FIX: guard the script body. multiprocessing re-imports this module in
# child processes on spawn-based platforms, and without the guard the whole
# pipeline (including run_pbt) would execute again inside every worker.
if __name__ == "__main__":
    # Load the raw Kaggle "Predict Hair Fall" CSV.
    data = pd.read_csv('/home/chenjz/python_programe/MO-PBT-main/脱发数据/Predict Hair Fall.csv')

    # Column groups. Note: several column names carry trailing spaces in the CSV.
    binary_features = ['Genetics', 'Hormonal Changes', 'Poor Hair Care Habits ',
                       'Environmental Factors', 'Smoking', 'Weight Loss ']
    ordinal_features = ['Stress']
    categorical_features = ['Medical Conditions', 'Medications & Treatments',
                            'Nutritional Deficiencies ']

    # Binary yes/no columns -> 1/0.
    # NOTE(review): values not exactly 'yes'/'no' (e.g. capitalized 'Yes')
    # map to NaN and are later filled with 0 — verify the CSV's casing.
    for feature in binary_features:
        data[feature] = data[feature].map({'yes': 1, 'no': 0})
    # Ordinal encoding for stress level; one-hot for nominal categories.
    for feature in ordinal_features:
        data[feature] = data[feature].map({'L': 0, 'M': 1, 'H': 2})
    data = pd.get_dummies(data, columns=categorical_features)
    # Simple missing-value strategy: fill with 0 (a richer imputation may help).
    data = data.fillna(0)

    # Features / target.
    X = data.drop(columns=['Id', 'Hair Loss'])
    y = data['Hair Loss']

    # 70% train, 15% validation, 15% test.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    print("训练集大小:", X_train.shape[0])
    print("验证集大小:", X_val.shape[0])
    print("测试集大小:", X_test.shape[0])

    # BUG FIX: the original fitted the scaler and the exploratory PCA on the
    # FULL dataset (train/val/test leakage) and then ran the final PCA on
    # *unscaled* X_train, so the scaling was silently dropped. Fit the scaler
    # on the training split only and feed scaled data to every PCA step.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Choose the number of components explaining >= 95% of the variance.
    pca = PCA()
    pca.fit(X_train)
    explained_variance_ratio = pca.explained_variance_ratio_
    print("Explained Variance Ratio:", explained_variance_ratio)
    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
    print("cumulative_variance_ratio:", cumulative_variance_ratio)
    threshold = 0.95
    n_components = np.argmax(cumulative_variance_ratio >= threshold) + 1
    print("Number of components to retain:", n_components)

    # Project all three splits onto the retained components.
    pca = PCA(n_components=n_components)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    X_val = pca.transform(X_val)
    print(X_train.shape)
    print(X_test.shape)

    # Conv1D expects (samples, steps, channels): add a channel axis of 1.
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
    X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)
    input_shape = X_train.shape[1:]
    print(input_shape)

    multiprocessing.freeze_support()
    # Run PBT and report the best model on the held-out test set.
    best_model = run_pbt(population_size=20, generations=8)
    best_model.summary()
    test_loss, test_accuracy = best_model.evaluate(X_test, y_test)
    print(f"Best Model Test Accuracy: {test_accuracy}")
作者在做数据预测时,曾简单运用过其它传统的统计分析模型,诸如SVM、决策树、逻辑回归和随机森林等,但效果都比较一般(详细可参看其他作者的模型分析:脱发数据集 - Heywhale.com),准确率普遍在0.5左右。而运用卷积神经网络配合PBT算法进行超参数优化后,准确率最高能提升到0.6。至于进一步的优化,或许可以朝着搭建更复杂的模型、增加所要优化的超参数种类,以及改进超参数的优化策略等方向努力,有兴趣的读者可以尝试一下。