Hair Loss Dataset (a CNN with PBT-based hyperparameter optimization)

Data Documentation

Background

As people age, hair loss becomes a health concern for many. A full head of hair affects not only appearance but is also closely tied to an individual's overall health.
This dataset brings together a range of factors that may contribute to hair loss, including genetics, hormonal changes, medical conditions, medications and treatments, nutritional deficiencies, and psychological stress.
Exploratory analysis of the data can surface potential associations between these factors and hair loss, providing useful input for personal health management, medical intervention, and related industries.
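
As a minimal sketch of such an exploration (assuming the CSV from the Kaggle page below is saved locally as Predict Hair Fall.csv; the column names follow the preprocessing code later in this post), one could check how each binary factor correlates with the Hair Loss label:

import pandas as pd

# Load the raw CSV (path is illustrative)
data = pd.read_csv('Predict Hair Fall.csv')

# Encode a few yes/no factors as 0/1 and correlate them with the target
for col in ['Genetics', 'Hormonal Changes', 'Smoking']:
    encoded = data[col].str.strip().str.lower().map({'yes': 1, 'no': 0})
    print(f"{col}: corr with Hair Loss = {encoded.corr(data['Hair Loss']):.3f}")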

Data Source

Hair Health Prediction | Kaggle

Related code:

import multiprocessing
import random

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.decomposition import PCA
from tensorflow.keras import backend as K


# Define the CNN model
def create_cnn_model(input_shape, learning_rate):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(32, kernel_size=3, activation='relu', input_shape=input_shape),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu'),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Training function defined at module level so worker processes can import it;
# it reads the data splits from module-level globals
def train_model(model):
    model.fit(X_train, y_train, epochs=20, verbose=0)
    loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
    return accuracy, model

# Population Based Training (PBT) loop
def run_pbt(population_size, generations):
    # Fix random seeds for reproducibility
    np.random.seed(42)
    tf.random.set_seed(42)
    # Initialise hyperparameters: give each member a distinct starting
    # learning rate (PBT needs a diverse population to explore)
    learning_rates = np.random.uniform(0.0001, 0.01, size=population_size)

    population = [create_cnn_model(input_shape, learning_rates[i]) for i in range(population_size)]
    best_model = None
    num_processes = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(num_processes)

    complex_params = []
    try:
        for gen in range(generations):
            print(f"Generation {gen + 1}")
            # Train every population member in parallel, then collect the
            # trained models back from the workers (each Keras model is
            # pickled to and from the worker process)
            accuracies = []
            results = []
            complex_param = []
            for model in population:
                results.append(pool.apply_async(train_model, (model,)))
            for i, result in enumerate(results):
                _, population[i] = result.get()
            # Evaluate each trained member on the validation set
            for i in range(population_size):
                loss, accuracy = population[i].evaluate(X_val, y_val, verbose=0)
                print(f"Model {i + 1}: loss={loss}, accuracy={accuracy}")
                accuracies.append(accuracy)
            # Rank members by validation accuracy (best first)
            sorted_indices = np.argsort(accuracies)[::-1]
            # Exploit: mark the top 30% and bottom 30% of the population
            exploit_indices_front = sorted_indices[:int(population_size * 0.3)]
            exploit_indices_back = sorted_indices[-int(population_size * 0.3):]
            best_model = population[sorted_indices[0]]
            # Record (index, accuracy, learning rate) for the top performers;
            # these feed the archive used below when replacing weak members
            for idx in exploit_indices_front:
                complex_param.append({"idx": idx, "accuracy": accuracies[idx], "lr": learning_rates[idx]})
            print(complex_param)
            if not complex_params:
                complex_params = complex_param.copy()
            else:
                # Merge this generation's records into the running archive
                # and keep only the highest-accuracy entries
                all_params = complex_param + complex_params
                all_params.sort(key=lambda x: x['accuracy'], reverse=True)
                complex_params = all_params[:len(complex_param)]
            print(complex_params)
            # Pool of learning rates taken from the best records so far
            lr_values = [item['lr'] for item in complex_params]

            # Explore/replace: the bottom performers copy weights from a random
            # top performer and adopt a learning rate drawn from the archive
            for idx in exploit_indices_back:
                source_idx = np.random.choice(exploit_indices_front)
                population[idx].set_weights(population[source_idx].get_weights())
                new_lr = random.choice(lr_values)
                K.set_value(population[idx].optimizer.learning_rate, new_lr)
                learning_rates[idx] = new_lr  # keep the bookkeeping array in sync
    finally:
        pool.close()
        pool.join()

    return best_model


# Load the data
data = pd.read_csv('/home/chenjz/python_programe/MO-PBT-main/脱发数据/Predict Hair Fall.csv')

# Handle non-numeric features (note: some column names in the raw CSV carry
# trailing spaces, so they are kept verbatim here)
binary_features = ['Genetics', 'Hormonal Changes', 'Poor Hair Care Habits ', 'Environmental Factors', 'Smoking',
                   'Weight Loss ']
ordinal_features = ['Stress']
categorical_features = ['Medical Conditions', 'Medications & Treatments', 'Nutritional Deficiencies ']

# Binary encoding (normalise case and whitespace before mapping, in case the
# raw values are capitalised, e.g. 'Yes'/'No')
for feature in binary_features:
    data[feature] = data[feature].str.strip().str.lower().map({'yes': 1, 'no': 0})

# Ordinal encoding for stress level
for feature in ordinal_features:
    data[feature] = data[feature].map({'L': 0, 'M': 1, 'H': 2})

# One-hot encode the remaining categorical features
data = pd.get_dummies(data, columns=categorical_features)

# Fill missing values (simple zero-fill here; real data may need more careful handling)
data = data.fillna(0)

# Split the data
X = data.drop(columns=['Id', 'Hair Loss'])
y = data['Hair Loss']
# Split into training, validation, and test sets (70/15/15)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
# Print the size of each split
print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Test set size:", X_test.shape[0])

# Standardise the data (fit the scaler on the training split only,
# to avoid leaking validation/test statistics)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Fit PCA on the scaled training data to inspect the explained variance
pca = PCA()
pca.fit(X_train)

# Explained variance per component
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance_ratio)

# Choose the number of principal components
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
print("cumulative_variance_ratio:", cumulative_variance_ratio)
# Keep enough components to explain 95% of the variance
threshold = 0.95
n_components = np.argmax(cumulative_variance_ratio >= threshold) + 1
print("Number of components to retain:", n_components)

# Project the splits onto the retained components
pca = PCA(n_components=n_components)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
X_val = pca.transform(X_val)
print(X_train.shape)
print(X_test.shape)
# Add a channel dimension so Conv1D receives (samples, features, 1)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)

# Assume X_train and X_test are 3D tensors (samples, timesteps, features)
# For tabular data, it might be (samples, features, 1) since there's no time dimension.
# Adjust input shape accordingly based on your data.
input_shape = X_train.shape[1:]
print(input_shape)
multiprocessing.freeze_support()
# Run the PBT loop (on platforms that spawn worker processes, the whole
# script body should sit under an `if __name__ == '__main__':` guard)
best_model = run_pbt(population_size=20, generations=8)
best_model.summary()

# Report the best model's performance on the test set
test_loss, test_accuracy = best_model.evaluate(X_test, y_test)
print(f"Best Model Test Accuracy: {test_accuracy}")

When the author first attempted predictions on this data, several conventional statistical models were tried, such as SVM, decision trees, logistic regression, and random forests, but the results were unremarkable (for details, see other authors' analyses: 脱发数据集 - Heywhale.com), with accuracy generally around 0.5. Using a convolutional neural network with PBT-based hyperparameter optimization, accuracy improved to at most about 0.6. Further gains might come from building more complex models, widening the set of hyperparameters being tuned, or refining the optimization strategy; a minimal sketch of the second direction follows, and interested readers are encouraged to experiment.
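
As one sketch of the "more hyperparameters" direction (the names init_hparams and perturb, and the dropout bounds below, are illustrative assumptions, not part of the code above), the explore step could jitter several hyperparameters at once instead of only the learning rate:

import numpy as np

rng = np.random.default_rng(42)

# Hypothetical per-member hyperparameter records
def init_hparams(population_size):
    return [{'lr': rng.uniform(1e-4, 1e-2), 'dropout': rng.uniform(0.1, 0.5)}
            for _ in range(population_size)]

def perturb(hparams):
    # Explore step: jitter each hyperparameter, clipped to sensible bounds
    return {
        'lr': float(np.clip(hparams['lr'] * rng.uniform(0.8, 1.2), 1e-5, 1e-1)),
        'dropout': float(np.clip(hparams['dropout'] + rng.uniform(-0.05, 0.05), 0.0, 0.6)),
    }

Each replaced population member would then be rebuilt from its new hyperparameter dict, for example by inserting a tf.keras.layers.Dropout(hparams['dropout']) layer before the dense head of create_cnn_model.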
