华为数学建模2021 D题

本文介绍了通过XGBoost和随机森林算法进行特征选择,以降低维度并筛选出对目标变量影响显著的20个特征。接着,使用XGBoost建立回归模型,评估模型性能,并探讨了不同模型对活性的影响。同时,针对ADMET的分类任务,运用多种分类器进行建模,最终选择效果最佳的模型。文章还探讨了特征与活性的定量关系,通过相关性分析和控制变量法理解各变量对活性的贡献。
摘要由CSDN通过智能技术生成


大致分享一下自己在建模过程中的思路,仅仅是做个记录


第一题:选取感兴趣的20个变量

对于729个属性变量建立回归和分类模型而言显得相对有点多,因此需要维度压缩。通常使用的维度压缩方法有很多,如:PCA、DAE(深度自编码器)等,但是这样得到的压缩维度不会对应到相应的原有属性,类似于特征映射而非特征筛选。因此第一题采取的思路是选用XGBoost、Random Forest和信息增益[1]筛选共有的特征。

import xgboost
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
import numpy as np
from imblearn.metrics import geometric_mean_score  # NOTE(review): imported but never used below

# Load the molecular descriptors (features) and the ERα activity (target).
desc_df = pd.read_excel(r"D:\BaiduNetdiskDownload\2021年D题\Molecular_Descriptor.xlsx", sheet_name="training")
act_df = pd.read_excel(r'D:\BaiduNetdiskDownload\2021年D题\ERα_activity.xlsx',sheet_name='training')

# Independent variables (X): every descriptor column except the leading ID
# column and the trailing one.  Dependent variable (Y): the last column of
# the activity sheet.  (Renamed from `input`/`output`, which shadow builtins.)
independentCol = desc_df.columns[1:-1]
dependentCol = act_df.columns[-1]
X = desc_df[independentCol]
Y = act_df[dependentCol]

# Rename feature columns to consecutive integers for training; colReplace
# keeps the mapping so the original names can be recovered afterwards.
numForCol = list(range(len(independentCol)))
colReplace = dict(zip(independentCol, numForCol))
X = X.rename(columns=colReplace)

# Random train/test split.
# NOTE(review): test_size=0.8 keeps only 20% of the samples for training --
# confirm this matches the "sample 80%" scheme described in the write-up.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.8, shuffle=True)

# best_score holds the best held-out R^2 seen so far (the original called it
# "gmean", but r2_score is what is actually computed); feaFinal holds the
# corresponding feature set.
best_score = 0
feaFinal = set()

# Iteratively shrink the feature set until the held-out R^2 stops improving.
while True:
    # Step 2: fit XGBoost on a random subsample and keep only the features
    # with strictly positive importance.  (The `use_label_encoder` flag of
    # the original is a classifier-only option, meaningless for a regressor,
    # so it has been dropped.)
    x_1, _, y_1, _ = train_test_split(x_train, y_train, test_size=0.8, shuffle=True)
    m1 = xgboost.XGBRegressor()
    m1.fit(x_1, y_1)
    f1 = {k for k, v in zip(x_1.columns, m1.feature_importances_) if v > 0}

    # Step 3: repeat step 2 on an independent random subsample.
    x_2, _, y_2, _ = train_test_split(x_train, y_train, test_size=0.8, shuffle=True)
    m2 = xgboost.XGBRegressor()
    m2.fit(x_2, y_2)
    f2 = {k for k, v in zip(x_2.columns, m2.feature_importances_) if v > 0}

    # Combine the two feature groups and refit on the reduced feature set.
    # NOTE(review): the write-up says "intersection" (交集) but the code takes
    # the UNION of the two sets -- behavior kept as-is pending confirmation.
    featureUnion = f1.union(f2)
    x_train = x_train[featureUnion]
    x_test = x_test[featureUnion]
    model = xgboost.XGBRegressor()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Score the reduced feature set on the held-out split.
    new_score = r2_score(y_test, y_pred)

    # Stop once the score no longer improves; otherwise record the new best.
    if new_score <= best_score:
        break
    best_score = new_score
    feaFinal = featureUnion
    # print("Update: " + str(new_score))
    # print("Update: " + str(feaFinal))

# Map the integer column names back to the original descriptor names.
findName = {v: k for k, v in colReplace.items()}
feaFinal = sorted(feaFinal)
SelectedFeatures = [findName[i] for i in feaFinal]
# print("best feature set: " + str(SelectedFeatures))
# print("final R^2: " + str(best_score))
# BUG FIX: keep the selection as a list.  The original stored
# str(SelectedFeatures), so the later `name in col_im` membership tests were
# substring checks against the stringified list, which can yield false
# positives (e.g. a short descriptor name contained in a longer one).
col_im = SelectedFeatures


import pandas as pd
from xgboost import XGBClassifier,XGBRegressor
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

file_label_act = r'D:\BaiduNetdiskDownload\2021年D题\ERα_activity.xlsx'
file_feature = r'D:\BaiduNetdiskDownload\2021年D题\Molecular_Descriptor.xlsx'
data_label_activity = pd.read_excel(file_label_act,sheet_name='training')
data_feature = pd.read_excel(file_feature,sheet_name='training')

# Features start at column 1 (column 0 is the molecule identifier).
x = np.array(data_feature.iloc[:,1:])
y = np.array(data_label_activity.iloc[:,-1])
x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.1)


def _rank_features(importances, names, k=30):
    """Return the names of the k features with the largest importance scores."""
    order = sorted(zip(names, importances), key=lambda item: item[1], reverse=True)
    return [feat for feat, _ in order[:k]]


# Random forest importance ranking.
cls = RandomForestRegressor()
cls.fit(x_train,y_train)
# print('model score before feature selection:', cls.score(x_val, y_val))
# BUG FIX: feature_importances_ corresponds to columns 1..end, so the
# identifier column must be dropped from the name list.  The original zipped
# the FULL column list against the importances, shifting every name by one
# (and silently dropping the last feature).
col_rf = _rank_features(cls.feature_importances_, data_feature.columns.tolist()[1:])

# XGBoost importance ranking (same fix applied).
cls = XGBRegressor()
cls.fit(x_train,y_train)
col_xg = _rank_features(cls.feature_importances_, data_feature.columns.tolist()[1:])


# Compare the three selections; col_im comes from the iterative-XGBoost /
# information-gain step above.
print('rf:',len(col_rf),'xg:',len(col_xg),'im:',len(col_im))

# Features selected by both RF and XGBoost.
col_xg_ = [i for i in col_rf if i in col_xg]
print('xg:',len(col_xg_),col_xg_)

# Features selected by both RF and the information-gain step.
col_im_ = [i for i in col_rf if i in col_im]
print('im:',len(col_im_),col_im_)

# RF features confirmed by at least one of the other two methods.
col_xg_im = [i for i in col_rf if i in col_im or i in col_xg]
print('xg or im:',len(col_xg_im),col_xg_im)

# RF features confirmed by BOTH other methods (the strongest consensus).
col_xg_im_and = [i for i in col_rf if i in col_im and i in col_xg]
print('xg and im:',len(col_xg_im_and),col_xg_im_and)

# Evaluate a random forest on the consensus feature set.
col_ = col_xg_im_and.copy()
x = np.array(data_feature[col_])
y = np.array(data_label_activity.iloc[:,-1])
x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.1)
cls = RandomForestRegressor()
cls.fit(x_train,y_train)
# (Message fixed: this score is measured AFTER feature selection, 前 -> 后.)
print('集成学习筛选特征后模型的得分效果:',cls.score(x_val,y_val))

# Pad the consensus set with RF∩XGB features, then RF∩info-gain features,
# and truncate to the 20 descriptors required by the problem statement.
for i in col_xg_:
    if i not in col_:
        col_.append(i)
for i in col_im_:
    if i not in col_:
        col_.append(i)

col_ = col_[:20]

第二题:建立回归模型

利用随机森林和XGBoost在筛选的特征上建立活性回归模型

# Final 20 descriptors selected in problem 1, hard-coded so this section can
# be run independently of the selection code above.
col_ =  ['MDEC-22', 'gmin', 'minHBint10', 'minaaN', 'maxHBint10', 'minHBint4', 'SHBint10', 'MDEO-11', 'C2SP1', 'maxdO', 'MDEN-33', 'fragC', 'ntsC', 'SHBint9', 'MDEC-24', 'BCUTw-1h', 'nAtomLAC', 'ATSc2', 'WPOL', 'ATSc1']

print('final col:',len(col_),col_)
x = np.array(data_feature[col_])
y = np.array(data_label_activity.iloc[:,-1])
x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.1)
# cls = RandomForestRegressor()
cls = XGBRegressor()
import xgboost as xgb
import matplotlib.pyplot as plt

cls.fit(x_train,y_train)
# Render the first boosted tree and save it to disk.
xgb.plot_tree(cls)
xgb.plot_tree(cls, num_trees=0)
fig = plt.gcf()
fig.set_size_inches(150, 100)
# plt.show()
fig.savefig(r'tree.png')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

# Train a plain Booster only so plot_importance can show named features.
dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=col_)
param = {}
# use softmax multi-class classification
# param['objective'] = 'binary:logistic'
# scale weight of positive examples
# param['eta'] = 0.1
# param['max_depth'] = 6
# param['silent'] = 1
# param['nthread'] = 4
# param['num_class'] = 9
model = xgb.train(param, dtrain)


xgb.plot_importance(model,height=0.8,title='活性分析的特征重要性', ylabel='特征')
plt.rc('font', family='Arial Unicode MS', size=14)
plt.show()

# (Message fixed: this score is measured AFTER feature selection, 前 -> 后.)
print('集成学习筛选特征后模型的得分效果:',cls.score(x_val,y_val))

plt.figure()
plt.plot(range(len(x_val)),y_val,label='real')
# plt.plot(range(len(x_val)),cls.predict(x_val),label='xgb')


# BUG FIX: this second regressor is a RandomForestRegressor, not XGBoost --
# the original named it xgb_reg and printed "xgb score", mislabeling the
# reported results.
rf_reg = RandomForestRegressor()
rf_reg.fit(x_train,y_train)
print('rf score:',rf_reg.score(x_val,y_val))

plt.plot(range(len(x_val)),rf_reg.predict(x_val),label='pred')
plt.title('RF')

plt.legend()
plt.show()

在这里插入图片描述

第三题 建立五个因变量(ADMET)的分类模型

利用sklearn调用11个分类模型的api,最终选取五个效果比较好的模型,再进行集成对五个变量分类

from xgboost import XGBClassifier,XGBRegressor
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,ExtraTreesClassifier,BaggingClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score,recall_score,r2_score,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings("ignore")

# Input workbooks: molecular descriptors (features) and the five binary
# ADMET labels.
file_feature = r'D:\BaiduNetdiskDownload\2021年D题\Molecular_Descriptor.xlsx'
file_label = r'D:\BaiduNetdiskDownload\2021年D题\ADMET.xlsx'
# ADMET labels: Caco-2, CYP3A4, hERG, HOB, MN
# Features: every descriptor column after the leading identifier column.
x = np.array(pd.read_excel(file_feature,sheet_name='training').iloc[:,1:])
# Target for this quick check: the Caco-2 label, cast to int for the classifiers.
y = np.array(pd.read_excel(file_label,sheet_name='training')['Caco-2'].astype('int'))

x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.1)

# The five ensemble members used for majority voting (parallel to `name`,
# which supplies the display name printed for each model's score).
model = [GradientBoostingClassifier(),AdaBoostClassifier(),
             XGBClassifier(),RandomForestClassifier(),
             ExtraTreesClassifier()]
name = ['GDBC','AdaC','XGB','RF','EtC']

# rf = RandomForestClassifier()
# xgb = XGBClassifier()
#
# xgb.fit(x_train,y_train)
# print(xgb.score(x_val,y_val))

def _majority_vote(votes_per_sample, n_models):
    """Collapse per-sample 0/1 votes (one row per sample) by majority vote.

    A sample is labeled 1 when more than half of the n_models voters
    predicted 1 (>= 1 + n_models // 2 positive votes).
    """
    return [1 if sum(row) >= 1 + n_models // 2 else 0 for row in votes_per_sample]


def fit_evaluate_1(x_train,x_val,y_train,y_val,model,name,x_test):
    """Fit each classifier, report validation metrics of the majority-vote
    ensemble, and return the ensemble's predictions for x_test.

    model and name are parallel lists: unfitted classifiers and the display
    names printed alongside each model's validation score.  Side effects:
    prints each model's score plus ensemble precision/recall/accuracy.
    """
    result = []
    result_test = []
    for j, i in zip(name, model):
        i.fit(x_train, y_train)
        print(j, ':', i.score(x_val, y_val))
        result.append(i.predict(x_val))
        result_test.append(i.predict(x_test))
    # Transpose so each row holds the per-model votes for one sample.
    result_test = np.array(result_test).T
    result = np.array(result).T

    # Validation metrics of the majority-vote ensemble.
    result_ = _majority_vote(result, len(name))
    print('precision:',precision_score(y_val,result_))
    print('recall:', recall_score(y_val, result_))
    print('accuracy:',accuracy_score(y_val,result_))
    print('\n\n')
    # print('r2:',r2_score(y_val,result_))

    # Ensemble predictions for the unseen test set.
    return _majority_vote(result_test, len(name))

# print(fit_evaluate_1(x_train,x_val,y_train,y_val,model,name))
# fit_evaluate_1(x_train,x_val,y_train,y_val,model,name)


# Re-load features and the full ADMET label frame for the all-targets run.
x = pd.read_excel(file_feature,sheet_name='training').iloc[:,1:]
y = pd.read_excel(file_label,sheet_name='training')
# x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.1)



def fit_all(x,y,model,name):
    """Train the voting ensemble for each of the five ADMET targets and
    write the predictions for the test sheet back into the label workbook.

    WARNING: this OVERWRITES file_label (ADMET.xlsx) in place, rewriting
    both its 'training' and 'test' sheets.
    """
    # Test-sheet features plus both label sheets (y_out receives predictions).
    x_in = pd.read_excel(file_feature,sheet_name='test').iloc[:,1:]
    y_in = pd.read_excel(file_label, sheet_name='training')
    y_out = pd.read_excel(file_label, sheet_name='test')
    x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.1)
    columns_ = ['Caco-2', 'CYP3A4', 'hERG', 'HOB', 'MN']
    for i in columns_:
        print('The next fit is ',i)
        # One binary target per iteration; fit_evaluate_1 prints validation
        # metrics and returns the majority-vote predictions for x_in.
        x_train_,x_val_,y_train_,y_val_ = np.array(x_train),np.array(x_val),np.array(y_train[i]),np.array(y_val[i])
        res_ = fit_evaluate_1(x_train_, x_val_, y_train_, y_val_, model, name,np.array(x_in))
        y_out[i] = res_
    print(y_out)
    # Write both sheets back to the Excel file.
    # NOTE(review): ExcelWriter.save() was removed in pandas 2.x -- this
    # sequence (save then close) assumes an older pandas; verify locally.
    writer = pd.ExcelWriter(file_label)
    y_in.to_excel(writer, 'training', float_format='%.10f')
    y_out.to_excel(writer, 'test', float_format='%.10f')  # sheet name for the written data
    writer.save()

    writer.close()
fit_all(x,y,model,name)

第四题 每个变量对活性的定量分析

这一题属于比较开放性的问题,主要思路1、求解相关性系数矩阵并可视化;2、控制变量法

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier

plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
from einops import rearrange, repeat  # NOTE(review): only used by the commented-out correlation heatmap below
file_label_act = r'D:\BaiduNetdiskDownload\2021年D题\ERα_activity.xlsx'
file_feature = r'D:\BaiduNetdiskDownload\2021年D题\Molecular_Descriptor.xlsx'
file_label_cls = r'D:\BaiduNetdiskDownload\2021年D题\ADMET.xlsx'

# The 20 descriptors selected in problem 1.
col_ =  ['MDEC-22', 'gmin', 'minHBint10', 'minaaN', 'maxHBint10', 'minHBint4', 'SHBint10', 'MDEO-11', 'C2SP1', 'maxdO', 'MDEN-33', 'fragC', 'ntsC', 'SHBint9', 'MDEC-24', 'BCUTw-1h', 'nAtomLAC', 'ATSc2', 'WPOL', 'ATSc1']

x = pd.read_excel(file_feature,sheet_name='training')[col_]
y = pd.read_excel(file_label_act,sheet_name='training').iloc[:,-1]
y_cls = pd.read_excel(file_label_cls,sheet_name='training').iloc[:,-1]

# Index (into col_) of the feature whose effect on the target is analysed.
q = 0

x['label'] = np.array(y_cls)     # use this line for the ADMET classification analysis
# x['label'] = np.array(y)       # use this line for the activity regression analysis

# print(x.head())

# rf = RandomForestRegressor()     # use this for the activity regression analysis
rf = RandomForestClassifier()      # use this for the ADMET classification analysis
rf.fit(np.array(x.iloc[:,:-1]),np.array(x.iloc[:,-1]))    # the trailing -1 selects ADMET's last ("T") column; use -2 for the
                                                            # previous ("E") one, and so on; for activity regression keep -1 as-is
                                                            # (NOTE(review): translated from the original comment -- verify mapping)

# print(x.head())
gmin = np.array(x['%s'%col_[q]])
# Sweep the chosen feature over its observed range (padded by 0.5) in 0.01 steps.
gmin_scatter = np.arange(min(gmin)-0.5,max(gmin)+0.5,0.01)

# Baseline sample: the first training row with all other features held fixed.
sample = [np.array(x.iloc[0,:-1]).tolist()]

gmin_,y_ = sample[0][q],float(x.iloc[0,-1])
plt.scatter(gmin_,y_,label='real',c='r')

# Control-variable analysis: vary ONLY feature q and record the model output.
act = []
for i in gmin_scatter:

    sample[0][q] = i
    act.append(rf.predict(sample))

plt.plot(gmin_scatter,act,label='predict')
plt.title('%s对活性的影响'%col_[q])
plt.xlabel('%s'%col_[q])
plt.ylabel('活性')
plt.legend()
plt.show()






# Commented-out alternative: correlation-matrix heatmap of the 20 features
# plus the label (uses einops.repeat to expand to an RGB image).

# x = np.array(x.corr())
# # print(x["label"].corr(x[col_]))
# print(x)
#
# print(x.shape)
#
# x = np.clip(x,0,1)
# x = repeat(x, 'h w -> h w c', c=3)
#
# print(x.shape)
# plt.imshow(x)
# plt.show()

在这里插入图片描述

在这里插入图片描述

总结

老老实实的迈好每一步,相信成功就在你身边,有疑惑可以在评论区留言噢! 甘愿为理想“头破血流”

引用

[1]周颖.基于信息增益的小型工业企业信用评级模型[J].运筹与管理,2021,30(01):209-216.

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值