一、进行数据读取
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
import fill_data#前文用于处理数据集的py文件
'''数据提取'''
train_data = pd.read_excel(r'.//temp_data//训练数据集.xlsx')#读取训练数据集
train_data_x= train_data.iloc[:,1:]#训练数据集的特征
train_data_y =train_data.iloc[:,0]#训练数据集的测试标签label
test_data = pd.read_excel(r'.//temp_data//测试数据集.xlsx')
test_data_x= test_data.iloc[:,1:]
test_data_y =test_data.iloc[:,0]
result_data = {}#用来保存后面6种模型的结果
二、通过网格搜索寻找参数的最优解(GNB除外)
以随机森林为例
一、导入模型函数(需更改)
from sklearn.ensemble import RandomForestClassifier
二、导入网格搜索函数 (通用)
from sklearn.model_selection import GridSearchCV
三、输入参数范围(需更改,可自行搜索)
# Hyper-parameter grid for RandomForestClassifier (adjust ranges as needed).
# NOTE: max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3;
# for classifiers it was an alias of 'sqrt', so dropping it loses nothing and
# avoids a duplicate point in the search space.
param_grid ={
    'n_estimators':[50,100,200],      # number of trees
    'max_depth':[None,10,20,30],      # None = grow nodes until pure
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[1,2,5],
    'max_features':['sqrt','log2'],   # candidate features considered per split
    'bootstrap':[True,False]
}
四、使用模型 (与导入模型相同)
logreg =RandomForestClassifier()#括号内可自行加入部分参数
五、进行网格搜索(通用)
# Generic step: exhaustive grid search with 5-fold cross-validation,
# then report the best hyper-parameter combination it found.
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(train_data_x, train_data_y)
print("最佳参数:")
print(grid_search.best_params_)
得到的最佳参数如下
逻辑回归,SVM算法,AdaBoost算法同理,下列分别是这三者的对应模型函数与参数
逻辑回归:
from sklearn.linear_model import LogisticRegression
# Hyper-parameter grid for LogisticRegression.
# NOTE: penalty='none' (the string) was removed in scikit-learn 1.2 — pass None instead.
# Not every penalty/solver pair is valid (e.g. 'lbfgs' supports only 'l2'/None);
# GridSearchCV scores invalid combinations as NaN (FitFailedWarning) and skips them.
param_grid ={
    'penalty':['l1','l2','elasticnet',None],
    'C':[0.001,0.01,0.1,1,10,100],   # inverse of regularization strength
    'solver':['newton-cg','lbfgs','liblinear','sag','saga'],
    'max_iter':[100,200,500],
    'multi_class':['auto','ovr','multinomial']
}
SVM算法
from sklearn.svm import SVC
# Hyper-parameter grid for the support-vector classifier (SVC).
param_grid = {
    'C': [0.01, 0.1, 1, 2],                         # regularization strength
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4, 5],                         # used by 'poly' only
    'gamma': ['scale', 'auto', 1],                  # kernel coefficient
    'coef0': [0.1],                                 # used by 'poly'/'sigmoid'
}
AdaBoost算法
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Hyper-parameter grid for AdaBoostClassifier.
# Fixes two garbled key names from the original text: 'learning rate' -> 'learning_rate'
# and 'alqorithm' -> 'algorithm' (GridSearchCV raises ValueError on unknown parameters).
param_grid ={
    'n_estimators':[50,100,200],
    'learning_rate':[0.01,0.1,0.5,1.0],
    'algorithm':['SAMME','SAMME.R'],
    # NOTE(review): 'base_estimator' was renamed 'estimator' in scikit-learn 1.2 —
    # use the new name if running on a recent version.
    'base_estimator': [DecisionTreeClassifier(max_depth=1),DecisionTreeClassifier(max_depth=2)]
}
XGBoost算法需额外设定大量模型参数,在此单独展示完整代码
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for XGBoost.
# Fixes a SyntaxError in the original: the comma after the 'n_estimators' entry was missing.
param_grid ={
    'learning_rate':[0.01,0.05,0.1],
    'n_estimators':[50,100,200],
    'max_depth':[3,5,7],
    'min_child_weight':[1,3,5],
    'gamma':[0,0.1,0.2],             # minimum loss reduction required to split
    'subsample':[0.6,0.8,1.0],       # row sampling ratio per tree
    'colsample_bytree':[0.6,0.8,1.0] # column sampling ratio per tree
}
# Baseline XGBoost classifier; the grid-searched keys above override these defaults.
# Fixes a garbled keyword from the original text: 'scale_pos_weiqht' -> 'scale_pos_weight'.
model = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,            # NOTE(review): legacy alias of n_jobs in recent xgboost
    scale_pos_weight=1,
    seed=27)              # NOTE(review): legacy alias of random_state
# Run the 3-fold cross-validated grid search on the training data
# and report the winning combination and its accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=2,
)
grid_search.fit(train_data_x, train_data_y)
print("Best parameters found: ", grid_search.best_params_)
print("Highest accuracy found: ", grid_search.best_score_)
在获得最佳参数后,可将除导入模型以外的网格搜索代码全部注释掉——网格搜索会花费大量时间,无需每次运行
三、 储存训练结果
依旧以随机森林为例
RF_result = {}  # holds this model's test metrics
# Best hyper-parameters found by the grid search above.
# NOTE: max_features="auto" was removed in scikit-learn 1.3; for classifiers it
# meant "sqrt", so that value is used explicitly here.
rf = RandomForestClassifier(n_estimators=100,
                            max_depth=20, max_features="sqrt",
                            min_samples_leaf=1, min_samples_split=2)
rf.fit(train_data_x, train_data_y)
train_predicted = rf.predict(train_data_x)
print('RF的train:\n', metrics.classification_report(train_data_y, train_predicted))
test_predicted = rf.predict(test_data_x)
print('RF的test:\n', metrics.classification_report(test_data_y, test_predicted))
# Use output_dict=True instead of parsing the report text by token position:
# the old index-based split() parsing silently breaks if the class count changes.
report = metrics.classification_report(test_data_y, test_predicted, output_dict=True)
# assumes the label classes are 0,1,2,3 (matches the fixed indices in the original) — TODO confirm
RF_result['recall 0'] = float(report['0']['recall'])  # recall of class 0
RF_result['recall 1'] = float(report['1']['recall'])
RF_result['recall 2'] = float(report['2']['recall'])
RF_result['recall 3'] = float(report['3']['recall'])
RF_result['acc'] = float(report['accuracy'])          # overall accuracy
result_data['RF'] = RF_result  # merge into the overall result dict
print('rf结束')
其余几种都是与上文类似,将用于区别模型名字的RF与搜索得到的最佳参数更改即可。
GNB无需网格搜索,直接导入模型按上述方式训练即可
from sklearn.naive_bayes import GaussianNB
四、将得到的结果保存为json文件
import json
# Wrap the collected model results under a single top-level key and dump to JSON.
result = {'mode fill': result_data}
with open(r'temp_data/训练方式.json', 'w', encoding='utf-8') as file:
    json.dump(result, file, ensure_ascii=False, indent=4)
将同一个数据集分别用6种方式进行缺失值填充并训练后,保存的结果如下
五、结果通过绘图展示
import matplotlib.pyplot as plt
import json
"""创建一个新的图形(6中算法的测试集的结果对比图)"""
#打开文件并读取JSON数据
ls =['随机森林填充.json','线性回归填充.json','平均值填充.json','中位数填充.json','众数填充.json','空数据删除.json']
d={}
for file_name in ls:
path = './/temp_data//' + file_name
with open(path, 'r', encoding='utf-8') as file:
data = json.load(file)
k = 0
for i in data.items():
for j in i[1].items():
if k == 0:
d[i[0]] = [j[1]['acc']]
k += 1
else:
a = [j[1]['acc']]
d[i[0]] = d[i[0]] + [j[1]['acc']]
k += 1
print(d)
X=['LR','RF','SVM','AdaBoost' ,'XGBoost','GNB']
plt.figure()
for i in d.items():
#绘制折线图
plt.plot(X,i[1],label = i[0])
plt.yticks([0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])
#添加标题和铀标签
plt.title('Rsults without data augmentation')
plt.xlabel('Train using 6 algorithms')
plt.ylabel('Accuracy rate of the model after testing')
plt.legend()
plt.show()