建模数据集不包含patient_id或case_no,但我们需要保存一版带id的数据集,自己看!
模型选择:
- 二分类模型。0-1
- 多分类模型。二分类和连续模型的中间模型,不像二分类那么粗糙,也不像回归要求连续数据,可以分段对抗缺失的重要连续数据。
- 回归模型。连续变量
目录
external and internal k-fold cross-validation for deep learning
1.8.4 extension-1:多分类,5-fold cross validation on all of data
1.8.5 extension-2:实在不行,从一折中挑个好的
Known error: `RuntimeError: error in LoadLibraryA` (a Windows DLL-loading failure)
deepFM(Factorization-Machine based Neural Network)
区分模型参数和权重
模型参数和权重不是一回事
模型参数:模型参数是用来划分训练集里的块epochs,而不是权重,所有不同的训练集训练出来的模型不同,这种机制才能在5折交叉验证后避免过拟合!
权重:是训练集训练出来的公式(模型)的参数,即f(x)=w1x1+w2x2+...+wnxn的wn,模型的参数设置的不是这些权重w,而是划分训练集epochs的方式!最佳权重w是由训练集计算出来的。
所以,模型fit是在参数和训练集下学习权重,predict是在权重不变下预测。
建模效果不好原因:
model
- 模型
- 调参。调参可以影响5%的模型准确率。
data
模型不好的原因基本是数据的问题!
- 不同列差异过大。不同量纲可能导致模型效果不好-->取log/归一化。
- 同列差距过大,离散化。建模效果肯定不好-->取log/归一化、删除异常值
1. 建模
1.1 读入数据
# Load the modelling dataset produced by the preprocessing step.
df_model = pd.read_excel(project_path + '/data/result/df_model_data.xlsx')
# Drop the stray index column an Excel round-trip sometimes adds
# (equivalent to re-reading with index_col=0).
if 'Unnamed: 0' in df_model.columns:
    df_model = df_model.drop(['Unnamed: 0'], axis=1)
df_model.shape
# Quick collinearity check: pairwise correlation matrix.
df_model.corr()
# Partition columns into discrete and continuous feature lists; the
# label column is not a feature, so it is removed from the latter.
discrete_col = ['gender', '谷胱甘肽', '异甘草酸镁', '托烷司琼', '甘草酸苷', '长春新碱', '多烯磷脂酰']
continuous_col = [col for col in df_model.columns if col not in discrete_col]
continuous_col.remove('bmd_label')
1.2 数据归一化
# NOTE(review): x_train / x_test are not defined anywhere above this
# point (the split into tran_x/test_x only happens later in 1.4) —
# the two StandardScaler transform lines look like leftover code and
# would raise a NameError if this file is run top-to-bottom; confirm.
from sklearn.preprocessing import StandardScaler
ss_x = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
# Simple max-scaling of each continuous column: keeps the magnitudes
# of different feature columns comparable so no single large-scale
# column dominates modelling. (Dividing by the max alone has known
# weaknesses, per the original author's note.)
for i in continuous_col:
    max_value = df_model[i].max()
    df_model[i] = df_model[i].apply(lambda x: round(x / max_value, 3))
1.3 插补数据
# Setup for random-forest-based missing-value imputation.
import pandas as pd
# Silence SettingWithCopy warnings triggered by the column-wise writes
# inside missing_value_interpolation below.
pd.set_option('mode.chained_assignment', None)
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
def missing_value_interpolation(df):
    """Impute missing values column-by-column with a random forest.

    For each column containing NaNs, a RandomForestRegressor is trained
    on the rows where that column is present (any *other* still-missing
    columns are temporarily zero-filled so the forest can train) and
    its predictions are written into the missing cells. Columns imputed
    earlier feed into the models for later columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame, possibly containing NaNs.

    Returns
    -------
    pandas.DataFrame
        The frame (re-indexed from 0) with every missing cell imputed.
    """
    df = df.reset_index(drop=True)
    # Columns that actually contain missing values.
    missing_cols = [c for c in df.columns if df[c].isnull().sum() > 0]
    for name in missing_cols:
        # Work on the missing-value columns only; re-slice from df each
        # round so previously imputed columns are visible as features.
        df_missing = df[missing_cols].copy()
        # Zero-fill the other columns' NaNs. fillna keeps the numeric
        # dtype — the previous astype('str') round-trip silently turned
        # entire columns into strings.
        other_cols = [c for c in missing_cols if c != name]
        df_missing[other_cols] = df_missing[other_cols].fillna(0)
        df_missing_is = df_missing[df_missing[name].isnull()]
        df_missing_not = df_missing[df_missing[name].notnull()]
        if df_missing_is.empty:
            # Defensive guard: nothing left to impute for this column.
            continue
        y = df_missing_not[name]
        x = df_missing_not.drop([name], axis=1)
        # Small 3-fold grid search over the forest size only.
        tree_grid_parameter = {'n_estimators': [10, 50, 100, 150, 200]}
        grid = GridSearchCV(RandomForestRegressor(), param_grid=tree_grid_parameter, cv=3)
        grid.fit(x, y)
        # Refit a fresh forest of the winning size on all observed rows.
        rfr = RandomForestRegressor(n_estimators=grid.best_params_['n_estimators'])
        rfr.fit(x, y)
        # Predict the missing cells and write them back into df.
        predict = rfr.predict(df_missing_is.drop([name], axis=1))
        df.loc[df[name].isnull(), name] = predict
    return df
# Impute the modelling dataset.
df_model_cb = missing_value_interpolation(df_model)
# Persist the imputed dataset. The context manager saves and closes the
# workbook on exit — ExcelWriter.save() was removed in pandas 2.0, so
# the old writer.save() call breaks on current pandas.
with pd.ExcelWriter(project_path + '/data/result/df_model_data_插补.xlsx') as writer:
    df_model_cb.to_excel(writer)
-->相关性检测
# Spearman rank correlation between each continuous feature and the label.
print('--------------------------计算连续变量的spearmanr相关性系数---------------------------------')
import re  # used below; no module-level import of re is visible in this file
from scipy import stats
t_list = []
p_list = []
q_list = []
for i in continuous_col:
    # Strip '<'/'>' markers (censored lab values) before the numeric cast.
    df_model_cb[i] = df_model_cb[i].astype('str').apply(lambda x: re.sub(r'<|>', '', x))
    x = df_model_cb[df_model_cb[i].astype('float').notnull()][i]
    y = df_model_cb[df_model_cb[i].astype('float').notnull()]['bmd_label']
    t, p = stats.spearmanr(x, y)
    t_list.append(round(t, 2))
    p_list.append(round(p, 3))
    q_list.append('斯皮尔曼')
df_spearmanr = pd.DataFrame(data={'连续检测指标': continuous_col,
                                  't值': t_list,
                                  'p值': p_list,
                                  '方法': q_list})
# Significant rows first, then the rest. The second slice uses a strict
# '>' so a row with p == 0.05 is not kept twice (the original <= / >=
# pair duplicated such rows).
df_spearmanr_1 = df_spearmanr[df_spearmanr['p值'] <= 0.05]
df_spearmanr_2 = df_spearmanr[df_spearmanr['p值'] > 0.05]  # not significant
df_spearmanr = pd.concat([df_spearmanr_1, df_spearmanr_2], axis=0)
df_spearmanr = df_spearmanr.sort_values(by=['p值'], ascending=False)
df_spearmanr = df_spearmanr.reset_index(drop=True)
# Context manager saves/closes the workbook (writer.save() is gone in
# pandas 2.0).
with pd.ExcelWriter(project_path + '/data/result/df_temp_spearmanr相关性检测.xlsx') as writer:
    df_spearmanr.to_excel(writer)
1.4 划分数据集
1.4.1 计算随机数种子
# 1.4.1 Search for a train/test split seed that yields a balanced test
# set and scores well with a quick CatBoost probe classifier.
from auto_ml.utils_models import load_ml_model
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, average_precision_score, precision_recall_curve
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE, ADASYN
import catboost
# 80/20 split of features vs. the '日剂量' (daily dose) target.
x = df_model_cb.drop(['日剂量'], axis=1)
y = df_model_cb['日剂量']
seeds_list = []
cat_f1_list = []
for i in range(101):
    tran_x, test_x, tran_y, test_y = train_test_split(x, y, test_size=0.2, random_state=i)
    # Reject unbalanced test splits *before* paying for model training.
    # (The original line was a syntax error: `if not ()6>=...`; the
    # intended condition is a class ratio between 3 and 6.)
    ratio = (test_y.value_counts().values[-2]) / (test_y.value_counts().values[-1])
    if not (6 >= ratio >= 3):
        continue
    # Oversample the training fold only.
    sm = SMOTE(random_state=0)
    tran_x_sm, tran_y_sm = sm.fit_resample(tran_x, tran_y)
    # Quick CatBoost classifier probe; f1 on the test fold scores the seed.
    cat_model = catboost.CatBoostClassifier(iterations=300,
                                            learning_rate=0.2,
                                            depth=6,
                                            l2_leaf_reg=2,
                                            subsample=1,
                                            loss_function='CrossEntropy',
                                            random_state=3)
    cat_model.fit(tran_x_sm, tran_y_sm)
    cat_predictions = cat_model.predict(test_x)
    cat_f1 = f1_score(test_y, cat_predictions)
    seeds_list.append(i)
    cat_f1_list.append(cat_f1)
# Rank candidate seeds by probe f1, best first, and persist the table.
df_seeds = pd.DataFrame(data={'seed': seeds_list,
                              'cat_f1': cat_f1_list})
df_seeds = df_seeds.sort_values(['cat_f1'], ascending=0).reset_index(drop=True)
df_seeds.to_excel(project_path + '/data/df_seeds.xlsx')
1.4.2 划分数据集
# 1.4.2 Re-split with the best classification seed found in 1.4.1.
from auto_ml.utils_models import load_ml_model
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
# 80/20 train/test split; '日剂量' (daily dose) is the target.
x = df_model_cb.drop(['日剂量'], axis=1)
y = df_model_cb['日剂量']
# Best seed = first row of df_seeds (sorted by cat_f1 descending).
seed_index = df_seeds.loc[0, 'seed']
tran_x, test_x, tran_y, test_y = train_test_split(x, y, test_size=0.2, random_state=seed_index)
# Abort when the test split is unbalanced (ratio of the two rarest
# classes > 3). NOTE(review): relies on `sys` being imported elsewhere
# in the notebook — no `import sys` is visible in this file; confirm.
if (test_y.value_counts().values[-2]) / (test_y.value_counts().values[-1]) > 3:
    print('测试集划分不平衡')
    sys.exit()
# Abort when the training split is badly unbalanced (ratio > 10).
if (tran_y.value_counts().values[-2]) / (tran_y.value_counts().values[-1]) > 10:
    print('训练集划分可能不平衡')
    sys.exit()
# Notebook-style inspection of the chosen seed and class balance.
seed_index
df_model.shape
tran_y.value_counts()
test_y.value_counts()
1.5 训练集过采样
如果目标分类变量数据分布不平衡,则需要进行过采样。
# Oversample the minority class of the training fold with SMOTE.
# ADASYN and SMOTETomek are imported as drop-in alternatives.
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
sm = SMOTE(random_state=0)
tran_x_sm, tran_y_sm = sm.fit_resample(tran_x, tran_y)
1.6 model:统一模型训练和输出格式
二分类模型
二分类模型一定要注意目标变量-阳性样本的标记,通常来说会把多数样本定义为0,少数样本定义为1。因为sklearn.metrics中precision_score,recall_score,f1_score指标默认计算的是label=1的指标,如果把多数样本标记为1,则在逐步向前算法中计算的f1指标会居高不下!!
# --- XGBoost binary classifier -------------------------------------
from sklearn.metrics import r2_score, average_precision_score, precision_recall_curve
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
import xgboost

# Hyper-parameters grouped in one dict so the configuration reads at a
# glance; values are unchanged from the tuned originals.
xgb_params = dict(max_depth=5,
                  learning_rate=0.018,
                  n_estimators=500,
                  min_child_weight=0.6,
                  eta=0.1,
                  gamma=0.5,
                  reg_lambda=5,
                  subsample=0.8,
                  colsample_bytree=0.6,
                  nthread=4,
                  scale_pos_weight=1,
                  random_state=3)
xgb_model = xgboost.XGBClassifier(**xgb_params)
# Train on the oversampled fold, predict on the untouched test fold.
xgb_model.fit(tran_x_sm, tran_y_sm)
xgb_predictions = xgb_model.predict(test_x)
import lightgbm
# LightGBM binary classifier.
# NOTE(review): `iterations`, `gamma` and `loss_function` are
# CatBoost-style names, not documented LGBMClassifier parameters —
# LightGBM passes unknown kwargs through, so these are most likely
# ignored and the tree count falls back to the default n_estimators.
# Verify against the LightGBM parameter docs before relying on them.
lgbm_model = lightgbm.LGBMClassifier(iterations=300,
                                     max_depth=8,
                                     min_child_weight=0.9,
                                     gamma=0.5,
                                     reg_lambda=5,
                                     subsample=0.4,
                                     learning_rate=0.2,
                                     loss_function='CrossEntropy',
                                     random_state=3)
# Train on the oversampled fold, predict on the test fold.
lgbm_model.fit(tran_x_sm, tran_y_sm)
lgbm_predictions = lgbm_model.predict(test_x)
import catboost
# CatBoost binary classifier; hyper-parameters grouped in a dict,
# values unchanged from the tuned originals.
cat_params = dict(iterations=300,
                  learning_rate=0.2,
                  depth=6,
                  l2_leaf_reg=2,
                  subsample=1,
                  loss_function='CrossEntropy',
                  random_state=3)
cat_model = catboost.CatBoostClassifier(**cat_params)
cat_model.fit(tran_x_sm, tran_y_sm)
cat_predictions = cat_model.predict(test_x)
# --- Random forest classifier (grid-searched size) -----------------
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
# Candidate forest sizes; 3-fold CV picks the best.
tree_grid_parameter = {'n_estimators': [10, 50, 100, 150, 200]}
grid = GridSearchCV(RandomForestClassifier(), param_grid=tree_grid_parameter, cv=3)
grid.fit(tran_x_sm, tran_y_sm)
# Refit with the winning size plus a fixed depth cap and seed.
rf_model = RandomForestClassifier(n_estimators=grid.best_params_['n_estimators'],
                                  max_depth=8,
                                  random_state=3)
rf_model.fit(tran_x_sm, tran_y_sm)
# Predict on the held-out test fold.
rf_predictions = rf_model.predict(test_x)
# --- Gradient-boosted decision-tree classifier ---------------------
gbdt_params = dict(n_estimators=300,
                   learning_rate=0.1,
                   max_depth=8,
                   subsample=0.4,
                   random_state=3)
gbdt_model = GradientBoostingClassifier(**gbdt_params)
gbdt_model.fit(tran_x_sm, tran_y_sm)
# Predict on the held-out test fold.
gbdt_predictions = gbdt_model.predict(test_x)
# --- Support-vector classifier (RBF kernel) ------------------------
# SVR is imported alongside for the regression variant of this notebook.
from sklearn.svm import SVR, SVC
# probability=True enables predict_proba for the AUC computation below.
svr_model = SVC(kernel='rbf',
                C=50,
                cache_size=200,
                probability=True,
                random_state=3)
svr_model.fit(tran_x_sm, tran_y_sm)
svr_predictions = svr_model.predict(test_x)
# --- L2-regularised logistic regression ----------------------------
# Lasso/Ridge/ElasticNet are imported as alternatives used by the
# regression variant of this notebook.
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
lcv_model = LogisticRegression(penalty='l2',
                               C=5,
                               solver='lbfgs',
                               max_iter=100,
                               random_state=3)
lcv_model.fit(tran_x_sm, tran_y_sm)
lcv_predictions = lcv_model.predict(test_x)
# --- Single-hidden-layer MLP classifier ----------------------------
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
ann_params = dict(alpha=0.1,
                  hidden_layer_sizes=[100, ],
                  solver='adam',
                  activation='relu',
                  random_state=3)
ANN_model = MLPClassifier(**ann_params)
ANN_model.fit(tran_x_sm, tran_y_sm)
ANN_predictions = ANN_model.predict(test_x)
# TabNet binary classifier.
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
TabNet_model = TabNetClassifier(n_d=8,
                                n_a=8,
                                n_steps=3,  # number of steps in the architecture (usually 3-10)
                                gamma=1.5,
                                n_independent=2)  # TabNetRegressor() for the regression variant
# Carve a 12.5% validation split out of the oversampled training fold
# for early stopping.
tran_x_x, tran_x_valid, tran_y_y, tran_y_valid = train_test_split(tran_x_sm, tran_y_sm, test_size=0.125, random_state=3)
# NOTE(review): eval_name=['train'] actually labels the *validation*
# split — the metric logged as 'train_auc' is computed on tran_x_valid.
TabNet_model.fit(X_train=tran_x_x.to_numpy(),
                 y_train=tran_y_y.to_numpy(),
                 eval_set=[(tran_x_valid.to_numpy(), tran_y_valid.to_numpy())],
                 eval_name=['train'],
                 eval_metric=['auc'],
                 max_epochs=200,
                 patience=50,  # stop after 50 epochs without improvement
                 batch_size=128,
                 virtual_batch_size=14,
                 num_workers=0,
                 drop_last=False)
# TabNet requires numpy input, hence .to_numpy().
TabNet_predictions = TabNet_model.predict(test_x.to_numpy())
# Compute evaluation metrics for every fitted binary model and collect
# them into one table.
from sklearn.metrics import classification_report, confusion_matrix
df_model_result = pd.DataFrame(
    columns=['model', 'index', 'precision', 'recall', 'f1-score', 'support', 'accuracy', 'AUC', 'sensitivity', 'specificity', 'FPR', 'FNR', 'Youden'])
model_list = [xgb_model, lgbm_model, cat_model, rf_model, gbdt_model, svr_model, lcv_model, ANN_model, TabNet_model]
model_name_list = ['XGBoost', 'LGBM', 'CatBoost', 'RF', 'GBDT', 'SVR', 'LR', 'ANN', 'TabNet']
for model, name in zip(model_list, model_name_list):
    # TabNet only accepts numpy input: temporarily swap test_x for its
    # numpy form, restoring the DataFrame at the end of the iteration.
    if name == 'TabNet':
        test_x_temp = test_x.copy()
        test_x = test_x.to_numpy()
    # Accuracy and AUC from the positive-class probability (last
    # column of predict_proba).
    test_y_score = model.predict_proba(test_x)[:, -1]
    auc = roc_auc_score(test_y, test_y_score)
    auc = round(auc, 4)
    accuracy = accuracy_score(test_y, model.predict(test_x))
    accuracy = round(accuracy, 4)
    # Sensitivity, specificity and false-positive/-negative rates from
    # the 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(test_y, model.predict(test_x)).ravel()
    sensitivity = round(tp / (tp + fn), 2)
    specificity = round(tn / (fp + tn), 2)
    FPR = round(fp / (fp + tn), 2)
    FNR = round(fn / (fn + tp), 2)
    # Youden index = sensitivity + specificity - 1.
    youden_index = sensitivity + specificity - 1
    df_model_result.loc[df_model_result.shape[0], ['model', 'accuracy', 'AUC', 'sensitivity', 'specificity', 'FPR', 'FNR', 'Youden']] = \
        [name, accuracy, auc, sensitivity, specificity, FPR, FNR, youden_index]
    # Append the per-class precision/recall/F1 rows (rows 0 and 1 of
    # the classification_report, i.e. the two class labels).
    report = classification_report(test_y, model.predict(test_x), output_dict=True)  # dict form for DataFrame conversion
    df_report = pd.DataFrame(report).transpose()  # classes become rows
    df_report = df_report.apply(lambda x: round(x, 4), axis=0)
    df_report = df_report.reset_index(drop=True)
    df_model_result = pd.concat([df_model_result, df_report.loc[0:1, :].reset_index()], axis=0, sort=False)
    df_model_result = df_model_result.reset_index(drop=True)
    # Restore the DataFrame version of test_x after the TabNet pass.
    if name == 'TabNet':
        test_x = test_x_temp.copy()
df_model_result.rename(columns={'model': '',
                                'index': 'label'}, inplace=True)
# Save the test-set performance table.
df_model_result.to_excel(project_path + '/data/df_模型测试效果.xlsx')
多分类模型
# --- XGBoost multi-class classifier --------------------------------
from sklearn.metrics import r2_score, average_precision_score, precision_recall_curve, brier_score_loss
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
import xgboost

# Hyper-parameters grouped in a dict; values unchanged from the tuned
# originals.
xgb_params = dict(max_depth=3,
                  learning_rate=0.004,
                  n_estimators=500,
                  min_child_weight=0.3,
                  eta=0.1,
                  gamma=0.4,
                  reg_lambda=15,
                  subsample=0.8,
                  colsample_bytree=0.8,
                  nthread=4,
                  scale_pos_weight=1,
                  random_state=3)
xgb_model = xgboost.XGBClassifier(**xgb_params)
xgb_model.fit(tran_x_sm, tran_y_sm)
xgb_predictions = xgb_model.predict(test_x)
import lightgbm
# LightGBM multi-class classifier.
# NOTE(review): `iterations`, `gamma` and `loss_function` are
# CatBoost-style names, not documented LGBMClassifier parameters —
# LightGBM passes unknown kwargs through, so these are most likely
# ignored. Verify against the LightGBM parameter docs.
lgbm_model = lightgbm.LGBMClassifier(iterations=300,
                                     max_depth=4,
                                     min_child_weight=0.5,
                                     gamma=0.5,
                                     reg_lambda=5,
                                     subsample=0.8,
                                     learning_rate=0.02,
                                     loss_function='CrossEntropy',
                                     random_state=3)
lgbm_model.fit(tran_x_sm, tran_y_sm)
lgbm_predictions = lgbm_model.predict(test_x)
import catboost
# CatBoost multi-class classifier ('MultiClass' loss); parameters
# grouped in a dict, values unchanged.
cat_params = dict(iterations=400,
                  learning_rate=0.008,
                  depth=3,
                  l2_leaf_reg=2,
                  loss_function='MultiClass',
                  random_state=3)
cat_model = catboost.CatBoostClassifier(**cat_params)
cat_model.fit(tran_x_sm, tran_y_sm)
cat_predictions = cat_model.predict(test_x)
# --- Random forest classifier (grid-searched size) -----------------
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
# Candidate forest sizes; 3-fold CV picks the best.
tree_grid_parameter = {'n_estimators': [10, 50, 100, 150, 200]}
grid = GridSearchCV(RandomForestClassifier(), param_grid=tree_grid_parameter, cv=3)
grid.fit(tran_x_sm, tran_y_sm)
# Refit with the winning size plus a fixed depth cap and seed.
rf_model = RandomForestClassifier(n_estimators=grid.best_params_['n_estimators'],
                                  max_depth=8,
                                  random_state=3)
rf_model.fit(tran_x_sm, tran_y_sm)
# Predict on the held-out test fold.
rf_predictions = rf_model.predict(test_x)
# --- Gradient-boosted decision-tree classifier ---------------------
gbdt_params = dict(n_estimators=300,
                   learning_rate=0.1,
                   max_depth=8,
                   subsample=0.4,
                   random_state=3)
gbdt_model = GradientBoostingClassifier(**gbdt_params)
gbdt_model.fit(tran_x_sm, tran_y_sm)
# Predict on the held-out test fold.
gbdt_predictions = gbdt_model.predict(test_x)
# --- Support-vector classifier (RBF kernel) ------------------------
# SVR is imported alongside for the regression variant of this notebook.
from sklearn.svm import SVR, SVC
# probability=True enables predict_proba for the OVR-AUC below.
svr_model = SVC(kernel='rbf',
                C=50,
                cache_size=200,
                probability=True,
                random_state=3)
svr_model.fit(tran_x_sm, tran_y_sm)
svr_predictions = svr_model.predict(test_x)
# --- L2-regularised logistic regression ----------------------------
# Lasso/Ridge/ElasticNet are imported as alternatives used by the
# regression variant of this notebook.
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
lcv_model = LogisticRegression(penalty='l2',
                               C=5,
                               solver='lbfgs',
                               max_iter=100,
                               random_state=3)
lcv_model.fit(tran_x_sm, tran_y_sm)
lcv_predictions = lcv_model.predict(test_x)
# --- Single-hidden-layer MLP classifier ----------------------------
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
ann_params = dict(alpha=0.1,
                  hidden_layer_sizes=[100, ],
                  solver='adam',
                  activation='relu',
                  random_state=3)
ANN_model = MLPClassifier(**ann_params)
ANN_model.fit(tran_x_sm, tran_y_sm)
ANN_predictions = ANN_model.predict(test_x)
# TabNet multi-task classifier used here as a multi-class model.
import torch
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.multitask import TabNetMultiTaskClassifier
TabNet_model = TabNetMultiTaskClassifier(
    n_d=3,
    n_a=3,
    n_steps=5,
    gamma=1,
    # optimizer_fn=torch.optim.Adam,
    # optimizer_params=dict(lr=2e-2),
    # scheduler_params={"step_size":50, # how to use learning rate scheduler
    #                   "gamma":0.9},
    # scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='entmax')  # alternative: "sparsemax"
# NOTE(review): this validation split is created but never passed to
# fit() — the model trains on the full oversampled fold with no
# eval_set, so `patience` has nothing to monitor. Confirm whether the
# split was meant to be used as in the binary section above.
tran_x_x, tran_x_valid, tran_y_y, tran_y_valid = train_test_split(tran_x_sm, tran_y_sm, test_size=0.125, random_state=3)
TabNet_model.fit(X_train=tran_x_sm.to_numpy(),
                 y_train=tran_y_sm.to_numpy().reshape(-1, 1),  # multitask API takes 2-D targets, one column per task
                 max_epochs=200,
                 patience=50,
                 batch_size=64,
                 virtual_batch_size=16,
                 num_workers=0,
                 drop_last=False,
                 loss_fn=[torch.nn.functional.cross_entropy])  # one loss per task
TabNet_predictions = TabNet_model.predict(test_x.to_numpy())
# Compute evaluation metrics for every fitted multi-class model.
from sklearn.metrics import classification_report, confusion_matrix
# One result table for all models.
df_model_result = pd.DataFrame(columns=['model', 'precision', 'recall', 'f1', 'accuracy', 'AUC'])
model_list = [xgb_model, lgbm_model, cat_model, rf_model, gbdt_model, svr_model, lcv_model, ANN_model, TabNet_model]
model_name_list = ['XGBoost', 'LGBM', 'CatBoost', 'RF', 'GBDT', 'SVR', 'LR', 'ANN', 'TabNet']
for model, name in zip(model_list, model_name_list):
    print(name)
    # TabNet only accepts numpy input; the DataFrame is restored after
    # its iteration.
    if name == 'TabNet':
        test_temp = test_x.copy()
        test_x = test_x.to_numpy()
    # Accuracy and one-vs-rest AUC. TabNet's predict_proba comes back
    # 3-D, so it is flattened to (rows, n_classes).
    # NOTE(review): the class count 6 is hard-coded in the reshape —
    # this breaks if the label ever has a different number of classes.
    test_y_score = np.reshape(model.predict_proba(test_x), (-1, 6))
    auc = roc_auc_score(test_y, test_y_score, multi_class='ovr')
    auc = round(auc, 2)
    # predict is likewise reshaped to a column vector; labels are
    # compared as strings so all models' outputs compare consistently.
    predictions = np.reshape(model.predict(test_x), (-1, 1)).astype(str)
    accuracy = accuracy_score(test_y.astype(str), predictions)
    accuracy = round(accuracy, 2)
    # Macro-averaged precision / recall / F1.
    precision = precision_score(test_y.astype(str), predictions, average='macro')
    precision = round(precision, 2)
    recall = recall_score(test_y.astype(str), predictions, average='macro')
    recall = round(recall, 2)
    f1 = f1_score(test_y.astype(str), predictions, average='macro')
    f1 = round(f1, 2)
    # Restore the DataFrame version of test_x after the TabNet pass.
    if name == 'TabNet':
        test_x = test_temp.copy()
    df_model_result.loc[df_model_result.shape[0], ['model', 'precision', 'recall', 'f1', 'accuracy', 'AUC']] = \
        [name, precision, recall, f1, accuracy, auc]
# Feature-importance scores from the fitted XGBoost model.
importance = xgb_model.feature_importances_
df_importance_temp = pd.DataFrame(data={'特征': tran_x.columns, '重要性评分': importance})
回归模型
# --- XGBoost regressor ---------------------------------------------
from sklearn.metrics import r2_score, average_precision_score, precision_recall_curve
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
import xgboost

# Hyper-parameters grouped in a dict; values unchanged from the tuned
# originals. Note regression fits on the raw (not oversampled) fold.
xgb_params = dict(max_depth=5,
                  learning_rate=0.018,
                  n_estimators=500,
                  min_child_weight=0.6,
                  eta=0.1,
                  gamma=0.5,
                  reg_lambda=5,
                  subsample=0.8,
                  colsample_bytree=0.6,
                  nthread=4,
                  scale_pos_weight=1,
                  random_state=3)
xgb_model = xgboost.XGBRegressor(**xgb_params)
xgb_model.fit(tran_x, tran_y)
xgb_predictions = xgb_model.predict(test_x)
import lightgbm
# LightGBM regressor.
# NOTE(review): `iterations`, `gamma` and `loss_function` are
# CatBoost-style names, not documented LGBMRegressor parameters —
# unknown kwargs pass through and are most likely ignored. Verify
# against the LightGBM parameter docs.
lgbm_model = lightgbm.LGBMRegressor(iterations=300,
                                    max_depth=8,
                                    min_child_weight=0.9,
                                    gamma=0.5,
                                    reg_lambda=5,
                                    subsample=0.4,
                                    learning_rate=0.2,
                                    loss_function='MAE',
                                    random_state=3)
# Regression fits on the raw (not oversampled) training fold.
lgbm_model.fit(tran_x, tran_y)
lgbm_predictions = lgbm_model.predict(test_x)
import catboost
# CatBoost regressor with MAE loss; parameters grouped in a dict,
# values unchanged.
cat_params = dict(iterations=300,
                  learning_rate=0.2,
                  depth=6,
                  l2_leaf_reg=2,
                  subsample=1,
                  loss_function='MAE',
                  random_state=3)
cat_model = catboost.CatBoostRegressor(**cat_params)
cat_model.fit(tran_x, tran_y)
cat_predictions = cat_model.predict(test_x)
# --- Random forest regressor (grid-searched size) ------------------
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
# RandomForestRegressor was previously only imported by the distant
# imputation cell (section 1.3); importing it here makes this cell
# runnable on its own.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Candidate forest sizes; 3-fold CV picks the best.
tree_grid_parameter = {'n_estimators': list((10, 50, 100, 150, 200))}
grid = GridSearchCV(RandomForestRegressor(), param_grid=tree_grid_parameter, cv=3)
grid.fit(tran_x, tran_y)
# Refit with the winning size plus a fixed depth cap and seed.
rf_model = RandomForestRegressor(n_estimators=grid.best_params_['n_estimators'],
                                 max_depth=8,
                                 random_state=3)
rf_model.fit(tran_x, tran_y)
# Predict on the held-out test fold.
rf_predictions = rf_model.predict(test_x)
# --- Gradient-boosted decision-tree regressor ----------------------
gbdt_params = dict(n_estimators=300,
                   learning_rate=0.1,
                   max_depth=8,
                   subsample=0.4,
                   random_state=3)
gbdt_model = GradientBoostingRegressor(**gbdt_params)
gbdt_model.fit(tran_x, tran_y)
# Predict on the held-out test fold.
gbdt_predictions = gbdt_model.predict(test_x)
# --- Support-vector regressor (default RBF kernel) -----------------
from sklearn.svm import SVR, SVC
svr_model = SVR()
svr_model.fit(tran_x, tran_y)
svr_predictions = svr_model.predict(test_x)
# --- Lasso linear model --------------------------------------------
# LogisticRegression / Ridge are imported as alternatives used by the
# classification variants of this notebook.
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
lcv_model = Lasso()
lcv_model.fit(tran_x, tran_y)
lcv_predictions = lcv_model.predict(test_x)
# --- Single-hidden-layer MLP regressor -----------------------------
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import classification_report, confusion_matrix
ann_params = dict(alpha=0.1,
                  hidden_layer_sizes=[100, ],
                  solver='adam',
                  activation='relu',
                  random_state=3)
ANN_model = MLPRegressor(**ann_params)
ANN_model.fit(tran_x, tran_y)
ANN_predictions = ANN_model.predict(test_x)
# TabNet regressor.
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
TabNet_model = TabNetRegressor(n_d=8,
                               n_a=8,
                               n_steps=3,  # number of steps in the architecture (usually 3-10)
                               gamma=1.5,
                               n_independent=2)
# 12.5% validation split of the training fold for early stopping.
tran_x_x, tran_x_valid, tran_y_y, tran_y_valid = train_test_split(tran_x, tran_y, test_size=0.125, random_state=3)
# NOTE(review): eval_name=['train'] actually labels the *validation*
# split — the metric logged as 'train_mae' is computed on tran_x_valid.
TabNet_model.fit(X_train=tran_x_x.to_numpy(),
                 y_train=tran_y_y.to_numpy().reshape(-1, 1),  # TabNetRegressor takes 2-D targets
                 eval_set=[(tran_x_valid.to_numpy(), tran_y_valid.to_numpy().reshape(-1, 1))],
                 eval_name=['train'],
                 eval_metric=['mae'],
                 max_epochs=200,
                 patience=50,  # stop after 50 epochs without improvement
                 batch_size=128,
                 virtual_batch_size=14,
                 num_workers=0,
                 drop_last=False)
# TabNet requires numpy input, hence .to_numpy().
TabNet_predictions = TabNet_model.predict(test_x.to_numpy())
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# One result table for all regression models; the "± x% range" columns
# hold the share of predictions falling within x% of the true value.
df_model_result = pd.DataFrame(
    columns=['model', 'R2', 'RMSE', 'MAE', 'Accuracy within ± 10% range', 'Accuracy within ± 20% range', 'Accuracy within ± 30% range',
             'Accuracy within ± 40% range'])
model_list = [xgb_model, lgbm_model, cat_model, rf_model, gbdt_model, svr_model, lcv_model, ANN_model, TabNet_model]
model_name_list = ['XGBoost', 'LGBM', 'CatBoost', 'RF', 'GBDT', 'SVR', 'LR', 'ANN', 'TabNet']
for model,name in zip(model_list,model_name_list):
# print(name)
# 计算R2、RMSE、MAE
if name == 'TabNet':
predictions=model.predict(test_x.to_numpy())
else:
predictions=model.predict(test_x)
r2=r2_score(test_y,predictions)
r2=round(r2,4)
mae=mean_absolute_error(test_y,predictions)
mae=round(mae,4)
rmse=mean_squared_error(test_y,predictions) ** 0.5
rmse=round(rmse,4)
# 计算'Accuracy within ± 10%, 20%, 30%, 40% range'
accuracy_10_list = [ (i,j) for