准备工作
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE
# Global plotting defaults: seaborn dark-grid theme plus a CJK-capable font
# (SimHei) so Chinese axis labels render instead of missing-glyph boxes.
sns.set_style("darkgrid")
plt.rcParams.update({'font.sans-serif': ['SimHei']})
# Load the input data sets (attachment 1 = samples x operating variables,
# attachment 4 = allowed value range per operating variable).
# NOTE(review): hard-coded absolute Windows paths — these only resolve on the
# original author's machine; adjust before running.
data1 = pd.read_excel('D:/研究生文献/17届研究生数学建模大赛/2020年中国研究生数学建模竞赛赛题/2020年B题/2020年B题--汽油辛烷值建模/数模题/附件一.xlsx')
data_285 = pd.read_excel('D:/研究生文献/17届研究生数学建模大赛/2020年中国研究生数学建模竞赛赛题/2020年B题/2020年B题--汽油辛烷值建模/数模题/285_313.xlsx',sheet_name='Sheet2')
data_313 = pd.read_excel('D:/研究生文献/17届研究生数学建模大赛/2020年中国研究生数学建模竞赛赛题/2020年B题/数模题/313.xlsx')
data4= pd.read_excel('D:/研究生文献/17届研究生数学建模大赛/2020年中国研究生数学建模竞赛赛题/2020年B题/2020年B题--汽油辛烷值建模/数模题/附件四:354个操作变量信息_改.xlsx')
第二问:
- 首先用附件四的范围值筛选出附件一354个操作变量数据中的异常数据,并统计每个存在异常数据的操作变量所含异常数据的数量。
# --- Range-based anomaly screening ------------------------------------------
# Attachment 4 stores each operating variable's allowed range ("取值范围") as a
# string like "low_high".  Parse those ranges, then flag every sample in data1
# that falls outside its variable's range.

# Split each range string into its tokens.
standard = [each.split('_') for each in data4['取值范围']]
# Evaluate each token to a number.
# NOTE(review): eval() executes arbitrary expressions from the spreadsheet;
# keep only if the workbook is trusted — otherwise switch to float().
standard_val = [[eval(tok) for tok in row] for row in standard]
# Map tag name ("位号") -> [low, high].
standard_dic = {tag: standard_val[i] for i, tag in enumerate(data4['位号'])}

# For each variable, collect row indices of samples outside [low, high].
standard_result = {}
for tag, bounds in standard_dic.items():
    low, high = bounds[0], bounds[1]
    col_val = data1[tag]
    standard_result[tag] = [inx for inx, j in enumerate(col_val)
                            if j > high or j < low]

# Keep only variables that actually have out-of-range samples.
bad_features = {k: v for k, v in standard_result.items() if len(v) != 0}
数据:
- 找出含0值较多的特征,占比如下:
def missing_data(data):
    """Convert every 0 entry in *data* to NaN so zeros count as missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data set (numeric columns; 0 is treated as "value missing").

    Returns
    -------
    pandas.DataFrame
        A copy of *data* with all 0 entries replaced by ``np.nan``.  The
        input frame itself is left unmodified.
    """
    # BUG FIX: the original did `data_ = data`, which merely aliased the
    # input, so the caller's DataFrame was mutated in place — contradicting
    # the docstring.  Work on an explicit copy instead.
    data_ = data.copy()
    # mask() replaces cells where the condition holds with NaN in one pass,
    # matching the original per-cell scan without chained-indexing
    # assignment (which pandas may silently ignore on a view).
    return data_.mask(data_ == 0, np.nan)
if __name__ == "__main__":
    # Convert zeros to NaN and report the resulting shape.
    Data = missing_data(data1)
    print(f'Data Size:{Data.shape}')
    print('----------------------------------------------------------------------------------------------')
    # Per-column missing share, largest first (computed once, used twice).
    miss_share = Data.isnull().mean().sort_values(ascending=False)
    print('Missing proportion:\n', miss_share.head(33))
    Missing_proportion = pd.DataFrame({'Proportion': miss_share.head(32)})
    # Bar chart of the missing-value share for the 32 worst features.
    plt.figure(figsize=(12, 8), dpi=100)
    plt.rcParams['font.sans-serif'] = ['SimHei']
    sns.barplot(Missing_proportion.index, Missing_proportion.Proportion)
    # plt.title('含零特征中的零值占比',fontsize=16,fontweight='bold')
    plt.xlabel('特征名', fontsize=12, fontweight='bold')
    plt.ylabel('比例', fontsize=12, fontweight='bold')
    plt.xticks(rotation=90, fontsize=8)
    plt.savefig('D:/研究生文献/17届研究生数学建模大赛/2020年中国研究生数学建模竞赛赛题/2020年B题/数模题/图/Missing proportion.jpg')
    plt.show()
- 删除含0值比例大于10%的特征和含异常值数大于5的特征,一共删除了19个含0值的特征和6个含异常值的特征,共计25个。
# --- Decide which features to drop ------------------------------------------
# Tabulate, per flagged feature, how many samples violated its allowed range.
bad_feature_names = list(bad_features.keys())
bad_feature_vals = [len(v) for v in bad_features.values()]
bad_features_total = pd.DataFrame({'Bad Features': bad_feature_names,
                                   'Number of bad features': bad_feature_vals})
bad_features_total_sort = bad_features_total.sort_values(by='Number of bad features', ascending=False)

# Features whose zero/missing share exceeds 10%.
miss_list = list(Missing_proportion.loc[Missing_proportion['Proportion'] > 0.1].index)
# Features with more than 5 out-of-range samples (per the process standard).
outlinear = list(bad_features_total_sort.loc[bad_features_total_sort['Number of bad features'] > 5]['Bad Features'])
print('Number of missing features:', len(miss_list))
print('Number of outlinear features:', len(outlinear))

# Union of both criteria = set of features to delete (set union replaces the
# original pair of add-loops; same contents).
delete_feature = set(miss_list) | set(outlinear)
print(len(delete_feature))
# NOTE(review): X is defined elsewhere in the original project (presumably
# data1's operating-variable columns) — confirm before running this section
# standalone.
X_drop = X.drop(delete_feature, axis=1)
- 利用RFR+RFE对特征进行重要性排序
1.首先对数据进行标准化(z-score)处理:
X_drop_norm = (X_drop-X_drop.mean())/X_drop.std()
标准化公式:$x' = \dfrac{x - \mu}{\sigma}$,其中 $\mu$ 为该特征的均值,$\sigma$ 为该特征的标准差。
2.然后对342维的数据利用RFR+RFE进行降维,降到35个特征
# Split before fitting RFE so the feature ranking is driven by training data
# only.  random_state added for reproducibility — the forest below is seeded,
# but the original split was not, so reruns selected different features.
X_train, X_test, y_train, y_test = train_test_split(X_drop_norm, Y, test_size=0.3, random_state=0)
RF = RandomForestRegressor(random_state=0, n_jobs=-1)
# Recursive feature elimination: drop one feature per step until 35 remain,
# ranked by the random forest's feature importances.
rfe = RFE(RF, n_features_to_select=35, step=1)
rfe.fit(X_train, y_train)
print('Chosen best 35 feature by rfe:', X_train.columns[rfe.support_])
features_35 = list(X_train.columns[rfe.support_])
X_norm_35 = X_drop_norm[features_35]
选出来的特征:
- 然后用Pearson相关性检验,计算35个特征中每个特征与多少个特征的相关性大于0.5,然后选择6个进行删除。
# --- Pearson-correlation screening among the 35 RFE-selected features --------
X_norm_35_corr = X_norm_35.corr()
# Hoisted: the original rebuilt list(X_norm_35_corr.columns) inside the inner
# loop, once per matrix cell.
cols = list(X_norm_35_corr.columns)

# For every feature, list the OTHER features whose correlation with it
# exceeds 0.5.
# NOTE(review): only positive correlations count (no abs()); strong negative
# correlations are ignored.  Behavior kept as-is because the six features
# deleted downstream were chosen from this exact counting — confirm intent.
corr = {}
for inx, name in enumerate(cols):
    val = X_norm_35_corr[name]
    corr[name] = [cols[j] for j, each in enumerate(val) if inx != j and each > 0.5]

# Keep only features strongly correlated with at least one other feature.
final_corr = {k: v for k, v in corr.items() if len(v) > 0}

# Count, per feature, how many others it is strongly correlated with, and
# sort descending — the basis for choosing which six features to delete.
num_dict = {k: len(v) for k, v in final_corr.items()}
sorted_num_dict = sorted(num_dict.items(), key=lambda x: x[1], reverse=True)
选择的特征:
‘产品的辛烷值RON’;‘S-ZORB.AT-0009.DACA.PV’;‘S-ZORB.PT_7107.DACA’;‘S-ZORB.PT_7103.DACA’;‘S-ZORB.TC_2801.PV’;‘S-ZORB.FC_2801.PV’
- 获得最终数据,共29维,并检测这些特征之间的相关性
# Drop the six strongly-correlated features chosen above; 29 features remain.
delete_features = ['辛烷值RON.1', 'S-ZORB.AT-0009.DACA.PV', 'S-ZORB.PT_7107.DACA',
                   'S-ZORB.PT_7103.DACA', 'S-ZORB.TC_2801.PV', 'S-ZORB.FC_2801.PV']
X_norm_final = X_norm_35.drop(delete_features, axis=1)

# Correlation heat map of the final feature set (visual check that no strong
# pairwise correlations remain).
plt.figure(figsize=(24, 24))
sns.heatmap(X_norm_final.corr(), square=True, linewidth=4,
            linecolor='black', annot_kws={'size': 12})
plt.savefig('D:/研究生文献/17届研究生数学建模大赛/2020年中国研究生数学建模竞赛赛题/2020年B题/数模题/图/最后的特征相关性.jpg')
plt.show()
- 建模
1.首先选择了6个模型来进行比较:随机森林、Ridge、Lasso、LinearRegression、Gradient Boosting Regression、SVR
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
# Hold out 20% of the samples; the test split feeds the final
# actual-vs-predicted scatter plot.
X_train, X_test, y_train, y_test = train_test_split(X_norm_final, Y, test_size=0.2, shuffle=True)

# Six candidate regressors (names are referenced by the cross-validation
# section below, so they stay as-is).
lr = LinearRegression()                       # ordinary least squares
rf = RandomForestRegressor(random_state=5)    # bagged decision trees
LS = Lasso(alpha=0.0005, random_state=5)      # L1-regularised linear model
svr = SVR()                                   # RBF support-vector regression
ridge = Ridge(alpha=0.002, random_state=5)    # L2-regularised linear model
GBR = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05,
                                max_depth=4, random_state=5)  # boosted trees

# Fit each model on the training split (same order as the original script).
for model in (lr, rf, LS, svr, ridge, GBR):
    model.fit(X_train, y_train)
- 用10-折交叉验证来计算每个模型的RMSE:
# --- 10-fold cross-validated RMSE for each candidate model -------------------
# The original repeated the same five-line stanza six times (and the variable
# names said "mae" while computing RMSE); deduplicated into one helper + loop
# that reproduces the exact same prints and cross_score contents/order.
n_folds = 10
cross_score = {}

def _cv_rmse(model):
    """Return the per-fold RMSE array for *model* over the full data set."""
    scores = cross_val_score(model, X_norm_final, Y,
                             scoring='neg_mean_squared_error', cv=n_folds)
    # scoring yields negative MSE; negate and take the root to get RMSE.
    return np.sqrt(-scores)

# (cross_score key, print label, fitted model) — labels match the original
# per-model copies exactly.
for key, label, model in [('LinearRegression', 'LR', lr),
                          ('RandomForest', 'RF', rf),
                          ('Lasso', 'LS', LS),
                          ('SVR', 'svr', svr),
                          ('Ridge', 'ridge', ridge),
                          ('Gradient Boosting Regression', 'GBR', GBR)]:
    rmse_scores = _cv_rmse(model)
    cross_score[key] = rmse_scores.mean().round(decimals=3)
    print('For {} model:'.format(label))
    print('Mean RMSE = ' + str(rmse_scores.mean().round(decimals=3)))
    print('Error std deviation = ' + str(rmse_scores.std().round(decimals=3)))
结果:
- 绘制RMSE图
# Bar chart comparing the cross-validated mean RMSE of all six models.
model_names = list(cross_score.keys())
model_RMSE = list(cross_score.values())
plt.figure(figsize=(12, 10))
sns.barplot(model_names, model_RMSE)
plt.xlabel('模型名称', fontsize=14, fontweight='bold')
# BUG FIX: the original y-axis label was the typo 'RMES'.
plt.ylabel('RMSE', fontsize=14, fontweight='bold')
plt.savefig('D:/研究生文献/17届研究生数学建模大赛/2020年中国研究生数学建模竞赛赛题/2020年B题/数模题/图/交叉验证中各个模型的平均误差.jpg')
plt.show()
- 选择了RMSE最小的随机森林作为目标模型,进行调参
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
# --- Tune the random forest: sweep the number of trees (100..950, step 50) ---
rmse_list = []
for n_trees in range(100, 1000, 50):
    candidate = RandomForestRegressor(n_estimators=n_trees, oob_score=True,
                                      random_state=5)
    # cross_val_score clones and refits the estimator on every fold, so the
    # original's extra candidate.fit(X_train, y_train) here was discarded
    # work and has been removed — rmse_list is unchanged.
    scores = cross_val_score(candidate, X_norm_final, Y,
                             scoring='neg_mean_squared_error', cv=n_folds)
    rmse_list.append(np.sqrt(-scores).mean().round(decimals=3))

# NOTE(review): sns.relplot is figure-level and creates its own figure, so
# the plt.figure(figsize=...) below does not affect it — kept for fidelity
# with the original script's output.
plt.figure(figsize=(14, 12))
sns.relplot(np.arange(100, 1000, 50), rmse_list, kind='line')
plt.xlabel('决策树个数', fontsize=14, fontweight='bold')
plt.ylabel('RMSE', fontsize=14, fontweight='bold')
plt.savefig('D:/研究生文献/17届研究生数学建模大赛/2020年中国研究生数学建模竞赛赛题/2020年B题/数模题/图/不同数量树情况下的RMSE.jpg')
plt.show()
结果:选择1000棵树(注:上述扫描范围为100–950,1000棵树是根据RMSE随树数的变化趋势外推选择的)。
- 构建最后的模型:
# --- Final model: a 1000-tree seeded random forest ---------------------------
final_rf = RandomForestRegressor(n_estimators=1000, oob_score=True, random_state=5)
# This fitted instance is reused by the test-set prediction plot below.
final_rf.fit(X_train, y_train)
# Report the 10-fold cross-validated mean RMSE on the full data set.
cv_scores = cross_val_score(final_rf, X_norm_final, Y,
                            scoring='neg_mean_squared_error', cv=n_folds)
final_rf_mae_scores = np.sqrt(-cv_scores)
rmse = final_rf_mae_scores.mean().round(decimals=3)
print(rmse)
RMSE:
- 绘制实际与预测
# --- Actual vs. predicted on the held-out test set ---------------------------
pred = final_rf.predict(X_test)
plt.figure(figsize=(14, 8))
plt.scatter(y_test, pred, s=20, color='blue')
# BUG FIX: the original labels said 'Actual/Predicted Sale Price' — copied
# from a house-price example.  This model predicts gasoline octane (RON).
plt.xlabel('Actual RON')
plt.ylabel('Predicted RON')
# y = x reference line: perfect predictions fall exactly on it.
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red')
plt.savefig('D:/研究生文献/17届研究生数学建模大赛/2020年中国研究生数学建模竞赛赛题/2020年B题/数模题/图/实际与预测之间的比较.jpg')
plt.show()