1、数据分析
Id |
Genetics |
Hormonal Changes |
Medical Conditions |
Medications & Treatments |
Nutritional Deficiencies |
Stress |
Age |
Poor Hair Care Habits |
Environmental Factors |
Smoking |
Weight Loss |
Hair Loss |
数据处理
1、缺失值舍去(先将 'No Data' 标记为缺失值 NaN)
# Data cleaning
import pandas as pd
import numpy as np


def clean_data(source_path="./Predict Hair Fall.csv",
               clean_URL='./Cleaned Predict Hair Fall.csv'):
    """Mark 'No Data' cells as missing and save a cleaned copy of the CSV.

    Args:
        source_path: Path of the raw CSV file to read.
        clean_URL: Path the cleaned CSV is written to (without the index).

    Returns:
        The path the cleaned CSV was written to (``clean_URL``).

    Side effects:
        Writes the cleaned CSV to disk, and prints the frame summary
        (``DataFrame.info``) and the frame itself.
    """
    HA = pd.read_csv(source_path)

    # One vectorised exact-match replacement instead of visiting every cell
    # through .iloc in a nested Python loop.
    HA = HA.replace('No Data', np.nan)

    # A leftover index column from an earlier to_csv round-trip, if present.
    HA = HA.drop(columns=['Unnamed: 0'], errors='ignore')

    HA.to_csv(clean_URL, index=False)
    HA.info()
    print(HA)
    return clean_URL


cleaned_file = clean_data()
HB = pd.read_csv(cleaned_file)
HB.info()
删除多余的 Id 列(它在训练中不起作用),并将 Yes/No 取值映射为 1/0
# Encode the binary Yes/No feature columns as 1/0.
# Series.map yields NaN for any cell whose value is not a key of `mapping`.
# NOTE(review): the heading above this cell mentions removing the Id column,
# but nothing here drops it -- confirm it is removed elsewhere.
mapping = {'Yes': 1, 'No': 0}

# The trailing spaces in 'Weight Loss ' and 'Poor Hair Care Habits ' are part
# of the column names and must be kept.
binary_columns = [
    'Genetics',
    'Weight Loss ',
    'Hormonal Changes',
    'Environmental Factors',
    'Poor Hair Care Habits ',
    'Smoking',
]
for column_name in binary_columns:
    HB[column_name] = HB[column_name].map(mapping)

HB.head()
将 Hair Loss 列移到最后一列,作为标签 y
import pandas as pd

# Reorder the columns so that the label column 'Hair Loss' comes last;
# every other column keeps its relative order.
label_column = 'Hair Loss'
reordered_columns = HB.columns.drop(label_column).tolist() + [label_column]
HB = HB.reindex(columns=reordered_columns)
HB.head(5)
计算相关性热力图
划分数据集
from sklearn.model_selection import train_test_split

# Split the data set: every column but the last is a feature, the last
# column is the label.
X, y = HB.iloc[:, :-1], HB.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each part, then the name of the label column.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)
print(HB.columns[-1])
构建模型
# Build the binary classifier model.
from sklearn.neural_network import MLPClassifier

# NOTE(review): per the scikit-learn docs, `learning_rate` is only used with
# solver='sgd'; confirm whether 'invscaling' is intended alongside 'lbfgs'.
mlp_settings = {
    'hidden_layer_sizes': (50, 75, 100),
    'activation': 'tanh',
    'solver': 'lbfgs',
    'learning_rate': 'invscaling',
    'max_iter': 500,
}
mlp = MLPClassifier(**mlp_settings)
mlp.fit(X_train, y_train)
进行随机搜索优化(注:下面这个单元实际使用的是网格搜索 GridSearchCV)
# Grid search for the best hyper-parameters.
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Parameter grid to search.
params = {
    'activation': ['relu', 'tanh', 'logistic', 'identity'],
    'hidden_layer_sizes': [(100,), (50, 100), (50, 75, 100)],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'learning_rate': ['constant', 'adaptive', 'invscaling'],
}

# The search is handed a default-constructed MLPClassifier instance and
# runs 5-fold cross-validation using all available cores.
mlp_classif_grid = GridSearchCV(
    MLPClassifier(),
    param_grid=params,
    n_jobs=-1,
    cv=5,
    verbose=5,
)

# Fit the search on the training split.
mlp_classif_grid.fit(X_train, y_train)

# Report the results for the best estimator found.
best_mlp = mlp_classif_grid.best_estimator_
train_accuracy = best_mlp.score(X_train, y_train)
test_accuracy = best_mlp.score(X_test, y_test)
print(f'Train Accuracy : {train_accuracy:.3f}')
print(f'Test Accuracy : {test_accuracy:.3f}')
print(f'Best Accuracy Through Grid Search : {mlp_classif_grid.best_score_:.3f}')
print('Best Parameters : ', mlp_classif_grid.best_params_)
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Train Accuracy : 1.000
Test Accuracy : 0.420
Best Accuracy Through Grid Search : 0.560
Best Parameters : {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'solver': 'lbfgs'}
考虑删除模型输入中的部分特征
1.年龄:未对年龄编码,年龄分布差异较大
# Feature table without the 'Age' column; HB itself is left unchanged.
HC = HB.drop(columns=['Age'])
HC.head(5)
同上构建模型优化
性能提升
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Train Accuracy : 1.000
Test Accuracy : 0.500
Best Accuracy Through Randomized Search : 0.556
Best Parameters : {'solver': 'lbfgs', 'learning_rate': 'invscaling', 'hidden_layer_sizes': (50, 150), 'activation': 'tanh'}