Titanic: Machine Learning from Disaster_02

二、模型的建立

1 初步建模结果

1.1 使用cross-validation评估训练结果 & 得到测试结果

from sklearn.model_selection import cross_val_score  # cross-validation

from sklearn.linear_model import LogisticRegression  # 逻辑回归
from sklearn.tree import DecisionTreeClassifier  # 决策树
from sklearn.ensemble import RandomForestClassifier  # 随机森林
from sklearn.neighbors import KNeighborsClassifier  # K近邻
from sklearn.naive_bayes import GaussianNB  # 朴素贝叶斯
from sklearn.svm import SVC  # 支持向量机
from sklearn.ensemble import GradientBoostingClassifier  # 梯度提升

# Baseline run: score seven classifiers with 5-fold cross-validation on the
# (scaled) training set, and keep the three tree-based models fitted on the
# full training data so their feature importances can be inspected next.
models = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    SVC(),
    GradientBoostingClassifier(),
]
names = ['lr', 'dtc', 'rfc', 'knn', 'gnb', 'svc', 'gbc']

r1 = []
for name, model in zip(names, models):
    # Mean accuracy across the 5 CV folds.
    cv_scores = cross_val_score(model, x_train_s, y_train, cv=5)
    r1.append(cv_scores.mean())
    # Fit the tree models on the whole training set for later
    # feature_importances_ inspection (other models are not kept).
    if name in ('dtc', 'rfc', 'gbc'):
        fitted = model.fit(x_train_s, y_train)
        if name == 'dtc':
            dtc = fitted
        elif name == 'rfc':
            rfc = fitted
        else:
            gbc = fitted

train_result_1 = pd.DataFrame(r1, columns=['train_result'], index=names)
train_result_1

1.2 使用3种树模型中的(特征重要性)均值,来查看哪些特征很重要

# Average the feature importances reported by the three fitted tree models
# (DataFrame addition aligns on the feature index, so order does not matter),
# then rank features from most to least important.
a = (
    pd.DataFrame({'importance': dtc.feature_importances_}, index=x_train_s.columns)
    + pd.DataFrame({'importance': rfc.feature_importances_}, index=x_train_s.columns)
    + pd.DataFrame({'importance': gbc.feature_importances_}, index=x_train_s.columns)
) / 3
a = a.sort_values('importance', ascending=False)

# Bar chart of the averaged importances, most important first.
a.plot.bar(figsize=(10, 6))
plt.xticks(rotation=45)
plt.title('Feature Importance')

 

2 超参数调优

from sklearn.model_selection import GridSearchCV  # 利用网格搜索和交叉验证(超参数调优)

def _grid_search(tag, estimator, param_grid):
    """Tune one estimator with a 10-fold grid search on the training set.

    Fits GridSearchCV(estimator, param_grid, cv=10) on (x_train_s, y_train),
    prints the best CV accuracy and best parameters prefixed with `tag`,
    and returns the fitted GridSearchCV object.
    """
    gscv = GridSearchCV(estimator, param_grid=param_grid, cv=10)  # 10-fold CV
    gscv.fit(x_train_s, y_train)
    print("{}:最佳准确率:{},最佳参数:{}".format(tag, gscv.best_score_, gscv.best_params_))
    return gscv


# 1 LogisticRegression
gscv_lr = _grid_search('lr', LogisticRegression(), {'C': [0.1, 1, 10]})

# 3 RandomForestClassifier
gscv_rfc = _grid_search('rfc', RandomForestClassifier(),
                        {'n_estimators': [120, 200, 300],
                         'max_depth': [5, 8]})

# 4 k-Nearest Neighbors
gscv_knn = _grid_search('knn', KNeighborsClassifier(),
                        {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9]})

# 6 Support Vector Machine
gscv_svc = _grid_search('svc', SVC(), {'C': [0.1, 1, 10]})

# 7 Gradient Boosting Decision Tree
gscv_gbdt = _grid_search('gbdt', GradientBoostingClassifier(),
                         {'n_estimators': [80, 120, 200],
                          'learning_rate': [0.05, 0.1, 0.5],
                          'max_depth': [3, 4, 5]})

3 运用最佳参数建模,并使用集成算法:投票器

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score  # 计算准确率
from sklearn.metrics import classification_report   # 分类问题的评估“召回率”

# Rebuild each classifier with the hyper-parameters found by the grid search
# and combine all seven in a majority-vote ("hard" voting) ensemble.
lr_clf = LogisticRegression(C=1)
dtc_clf = DecisionTreeClassifier()
rfc_clf = RandomForestClassifier(max_depth=8, n_estimators=120)
knn_clf = KNeighborsClassifier(n_neighbors=7)
gnb_clf = GaussianNB()
svc_clf = SVC(C=1)
gbc_clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=120)

es = [('lr', lr_clf), ('dtc', dtc_clf), ('rfc', rfc_clf), ('knn', knn_clf),
      ('gnb', gnb_clf), ('svc', svc_clf), ('gbc', gbc_clf)]
vote_clf = VotingClassifier(estimators=es, voting='hard')
vote_clf.fit(x_train_s, y_train)

# Precision/recall/F1 of the ensemble on its own training data
# (optimistic by construction; not a held-out evaluation).
yp_vote = vote_clf.predict(x_train_s)
print(classification_report(y_train, yp_vote))

# Training-set accuracy of each individual model plus the ensemble.
r2 = []
indexs = []
for estimator in (lr_clf, dtc_clf, rfc_clf, knn_clf, gnb_clf, svc_clf, gbc_clf, vote_clf):
    estimator.fit(x_train_s, y_train)
    train_preds = estimator.predict(x_train_s)
    r2.append(accuracy_score(y_train, train_preds))
    indexs.append(estimator.__class__.__name__)

train_result_2 = pd.DataFrame({'train_result': r2}, index=indexs)
train_result_2

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值