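# Provides four prediction models and their evaluation results: KNN, logistic regression, decision tree classification, and a voting ensemble.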
# 1. Load the data set, then separate the feature columns (x) from the
#    prediction target (y).
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

worker = pd.read_csv('C:\\Users\\Liu\\Desktop\\data.csv')
x = worker.drop(columns=['Attrition'])  # every column except the target
y = worker['Attrition']                 # the target column

# 2. Split into training and test subsets. The fixed random_state makes the
#    split reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=666)
# 3. Build the three grid-search-tuned base models: KNN, logistic regression
#    and a decision tree.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# (1) KNN tuned by grid search over the number of neighbours (1 to 6).
knn_clf = KNeighborsClassifier()
param_knn = {
    'n_neighbors': list(range(1, 7)),
}
knn_grid = GridSearchCV(knn_clf, param_knn)
knn_grid.fit(x_train, y_train)
# Best estimator found by the search; reused later as a voting base learner.
knn_best_model = knn_grid.best_estimator_

# Report the results explicitly. As bare expression statements these values
# were computed and then discarded, so running the script showed nothing.
print('KNN best params:', knn_grid.best_params_)
print('KNN best cross-validation score:', knn_grid.best_score_)
print('KNN test score:', knn_grid.score(x_test, y_test))
# (2) Logistic regression tuned by grid search.
from sklearn.model_selection import GridSearchCV

lr_clf = LogisticRegression()
# Two sub-grids: the L1 penalty is searched with liblinear only, while the
# L2 penalty is searched across all five listed solvers.
# NOTE(review): the solvers run with the default iteration limit; if the
# features are unscaled, some may emit convergence warnings - confirm.
param_lr = [
    {
        'C': [0.0001, 0.01, 0.1, 1],
        'penalty': ['l1'],
        'solver': ['liblinear'],
    },
    {
        'C': [0.0001, 0.01, 0.1, 1],
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
]
lr_grid = GridSearchCV(lr_clf, param_lr)
lr_grid.fit(x_train, y_train)
# Best estimator found by the search; reused later as a voting base learner.
lr_best_model = lr_grid.best_estimator_

# Report the results explicitly (a bare `lr_grid.best_score_` expression is
# discarded in a script). The test-set score is included so this model is
# evaluated the same way as the KNN and decision-tree models.
print('Logistic regression best params:', lr_grid.best_params_)
print('Logistic regression best cross-validation score:', lr_grid.best_score_)
print('Logistic regression test score:', lr_grid.score(x_test, y_test))
# (3) Decision tree tuned by grid search.
from sklearn.model_selection import GridSearchCV

# random_state pins the randomness of the 'random' splitter. The criterion
# given here is only a starting value: the grid below searches both criteria.
dt_clf = DecisionTreeClassifier(random_state=30, splitter='random', criterion='gini')
param_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(2, 10)),
    'min_samples_leaf': list(range(1, 10)),
    'min_samples_split': list(range(2, 10)),
}
# 8-fold cross-validation over every combination in param_dt.
dt_grid = GridSearchCV(dt_clf, param_grid=param_dt, cv=8)
dt_grid.fit(x_train, y_train)
# Best estimator found by the search; reused later as a voting base learner.
dt_best_model = dt_grid.best_estimator_

# Report the results explicitly. As bare expression statements these values
# were computed and then discarded, so running the script showed nothing.
print('Decision tree best params:', dt_grid.best_params_)
print('Decision tree best cross-validation score:', dt_grid.best_score_)
print('Decision tree test score:', dt_grid.score(x_test, y_test))
# 4. Use the three tuned models above as base learners for a voting ensemble,
#    and grid-search the voting strategy (soft vs. hard).
from sklearn.ensemble import VotingClassifier

voting_param = {
    'voting': ['soft', 'hard'],
}
voting_clf = VotingClassifier(
    estimators=[
        ('knn_clf', knn_best_model),
        ('lr_clf', lr_best_model),
        ('dt_clf', dt_best_model),
    ]
)
# 10-fold cross-validation over the two voting strategies.
model = GridSearchCV(voting_clf, cv=10, param_grid=voting_param)
model.fit(x_train, y_train)

# Report the results explicitly (a bare `model.best_score_` expression is
# discarded in a script). The test-set score is included so the ensemble can
# be compared with the individual base models.
print('Voting best params:', model.best_params_)
print('Voting best cross-validation score:', model.best_score_)
print('Voting test score:', model.score(x_test, y_test))