Python机器学习 集成算法实例

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

# Fixed seed: applied to NumPy's global RNG here and reused as random_state
# by the train/test split and several estimators further down.
SEED = 222
np.random.seed(SEED)

# Raw data; later code reads its cand_pty_affiliation and transaction_amt columns.
df = pd.read_csv('input.csv')
#切分训练集和测试集
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def get_train_test(test_size=0.95):
    """Build features and a binary target from ``df`` and split them.

    The target is 1 where ``cand_pty_affiliation`` equals "REP" and 0
    otherwise.  Every other column is one-hot encoded (sparse output) and
    columns whose standard deviation is 0 are discarded before splitting.

    Args:
        test_size: Forwarded unchanged to ``train_test_split``.

    Returns:
        The four-way split returned by ``train_test_split`` (called with
        ``random_state=SEED``), unpacked by the caller as
        ``xtrain, xtest, ytrain, ytest``.
    """
    target = (df["cand_pty_affiliation"] == "REP") * 1

    features = df.drop(columns=["cand_pty_affiliation"])
    # One-hot encode the categorical features.
    features = pd.get_dummies(features, sparse=True)

    # A zero standard deviation means the column has the same value in every
    # row, so it cannot help separate the classes.
    constant_columns = features.columns[features.std() == 0]
    features = features.drop(columns=constant_columns)

    return train_test_split(
        features, target, test_size=test_size, random_state=SEED
    )

# Split once at module level; the code below reuses these four globals.
xtrain, xtest, ytrain, ytest = get_train_test()

print("\nExample data:")
# Preview the first rows of the raw data.
df.head()

 

cand_pty_affiliation:我们要预测的指标,共和党或者民主党
entity_tp:个人还是组织
classification:领域
rpt_tp:贡献的大小
cycle:捐赠在哪年
transaction_amt:捐献金额
# Bar chart of the normalised value counts of the target column.
df.cand_pty_affiliation.value_counts(normalize = True).plot(
    kind  = "bar", title = "Share of No. donations")
plt.show()
# Shows the ratio of positive ("REP") to negative examples in the raw data.

 


import pydotplus
#导入结构化图形绘制工具
from IPython.display import Image
#导入图片显示的库,能够打开图片文件在jupyter中进行显示
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
#导入决策树模型和绘制决策树.dot文件的库

def print_graph(clf, feature_names):
    """Render a fitted decision tree as a PNG image for notebook display.

    Args:
        clf: Fitted decision tree estimator handed to ``export_graphviz``.
        feature_names: Feature names forwarded to ``export_graphviz``.

    Returns:
        An ``IPython.display.Image`` wrapping the PNG bytes produced by
        ``graph.create_png()``.
    """
    dot_data = export_graphviz(
        clf,
        label='root',
        proportion=True,
        impurity=False,
        # out_file=None makes export_graphviz return the DOT source as a string.
        out_file=None,
        feature_names=feature_names,
        # Documented form is a list of strings in ascending class order:
        # class 0 -> "D", class 1 -> "R".
        class_names=["D", "R"],
        filled=True,
        rounded=True)

    # Parse the DOT source into a pydotplus graph object, then render it.
    graph = pydotplus.graph_from_dot_data(dot_data)
    return Image(graph.create_png())


t1 = DecisionTreeClassifier(max_depth = 1, random_state = SEED)
# Decision tree limited to depth 1.
t1.fit(xtrain, ytrain)
# The tree is fitted on the training split only.
p = t1.predict_proba(xtest)[:,1]
# predict_proba returns one column per class; column 1 is the probability of class 1 ("REP").
print("Decision tree ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
print_graph(t1, xtrain.columns)
   

 

这里最后预测结果都是民主党,结果都是一样的没啥用,接下来对预剪枝参数进行调整 

# Decision tree with the depth limit raised to 3; random_state stays at SEED.
t2 = DecisionTreeClassifier(max_depth = 3, random_state = SEED)
t2.fit(xtrain, ytrain)
# Column 1 of predict_proba is the probability of class 1 ("REP").
p = t2.predict_proba(xtest)[:, 1]

print("Decision tree ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
print_graph(t2, xtrain.columns)

 

47.3% 的样本落到了最左边,还有 35.9% 基本落在了最右边。这看起来模型基本已经过拟合了。

我们来调整下策略,去掉一个对结果有着最大影响的因素再来看看!

# Remove the most influential feature, the donation amount.
drop = ['transaction_amt']
# axis is passed by keyword: since pandas 2.0 every argument of
# DataFrame.drop except `labels` is keyword-only.
xtrain_slim = xtrain.drop(drop, axis=1)
xtest_slim = xtest.drop(drop, axis=1)

# Depth-3 tree fitted on the reduced feature set.
t3 = DecisionTreeClassifier(max_depth = 3, random_state = SEED)
t3.fit(xtrain_slim, ytrain)
p = t3.predict_proba(xtest_slim)[:,1]

print("Decision tree ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
print_graph(t3, xtrain_slim.columns)

# Average the class-1 probabilities of t2 (all features) and t3 (without
# transaction_amt) element-wise, then score the blended prediction.
p1 = t2.predict_proba(xtest)[:, 1]
p2 = t3.predict_proba(xtest_slim)[:,1]
p = np.mean([p1,p2],axis = 0)
print("Average of decision tree ROC-AUC score: %.3f" % roc_auc_score(ytest, p))

 

 

整了个平均还真比原来高了! 这么一说,应该是选择不同的特征会产生不同的结果,然后用不同的结果再进行组合得到了一个升华!那我们多选几组不就是随机森林了嘛! 

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators = 10,
    max_features = 3,
    random_state = SEED
)
# n_estimators: number of decision trees in the forest.
# max_features: number of features considered when looking for the best split.

rf.fit(xtrain,ytrain)
# Column 1 of predict_proba is the probability of class 1 ("REP").
p = rf.predict_proba(xtest)[:,1]
print("Average of decision tree ROC-AUC score: %.3f" % roc_auc_score(ytest, p))


 

#这里把sklearn的算法全部押上
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline

def get_models():
    """Return the unfitted base learners keyed by display name.

    Returns:
        A dict mapping a short name to a scikit-learn classifier.  Insertion
        order is svm, knn, naive bayes, mlp-nn, random forest, gbm, logistic.
    """
    return {
        'svm': SVC(C=100, probability=True),
        'knn': KNeighborsClassifier(n_neighbors=3),
        'naive bayes': GaussianNB(),
        'mlp-nn': MLPClassifier(
            (80, 10), early_stopping=False, random_state=SEED),
        'random forest': RandomForestClassifier(
            n_estimators=10, max_features=3, random_state=SEED),
        'gbm': GradientBoostingClassifier(
            n_estimators=10, max_features=3, random_state=SEED),
        'logistic': LogisticRegression(C=100, random_state=SEED),
    }

def train_predict(model_list):
    """Fit every model on the training split and collect test-set probabilities.

    Reads the module-level ``xtrain``, ``ytrain``, ``xtest`` and ``ytest``.

    Args:
        model_list: Mapping of model name to an estimator exposing ``fit``
            and ``predict_proba``.

    Returns:
        A DataFrame with one row per test sample and one column per model,
        named after the mapping's keys, holding column 1 of each model's
        ``predict_proba`` output.
    """
    # Zero-filled frame: one row per test sample, one column per model.
    P = pd.DataFrame(np.zeros((ytest.shape[0], len(model_list))))

    print("Fitting models.")
    cols = []
    # Iterate over the mapping that was passed in, so the function works for
    # any caller-supplied set of models rather than only a global one.
    for i, (name, m) in enumerate(model_list.items()):
        print("%s..." % name, end=" ", flush=False)
        m.fit(xtrain, ytrain)
        P.iloc[:, i] = m.predict_proba(xtest)[:, 1]
        cols.append(name)
        print("done")

    # Replace the default integer column labels with the model names.
    P.columns = cols
    print("Done.\n")
    return P

def score_models(P, y):
    """Print the ROC-AUC of every prediction column in ``P`` against ``y``.

    Args:
        P: DataFrame in which each column holds one model's predicted scores.
        y: True labels, passed as the first argument of ``roc_auc_score``.
    """
    print("Scoring models.")
    for model_name in P.columns:
        auc = roc_auc_score(y, P[model_name])
        print("%-26s: %.3f" % (model_name, auc))
    print("Done.\n")

 models = get_models()
#调用get_models获得所有的模型集合组成的一个字典models
P = train_predict(models)
#依次利用每个算法模型进行拟合预测获得P矩阵中包含的每种模型算法的预测结果
score_models(P,ytest)
#使用ROC-AUC评价指标对P矩阵中模型预测结果与实际测试集进行比较

 

 

# Import the correlation-matrix visualisation helper.
from mlens.visualization import corrmat

# Plot the pairwise correlations between the columns of P (one column per model).
corrmat(P.corr(), inflate = False)
plt.show()

 预测的结果很多都是高度相关的!

 

# Simple ensemble: average the models' predictions row-wise and score the result.
print("Ensemble ROC-AUC score: %.3f" % roc_auc_score(ytest, P.mean(axis = 1)))

from sklearn.metrics import roc_curve

def plot_roc_curve(ytest, P_base_learners, P_ensemble, labels, ens_label):
    """Draw the ROC curves of the base learners and the ensemble on one figure.

    Args:
        ytest: True labels passed to ``roc_curve``.
        P_base_learners: 2-D array indexed as ``[:, i]``; column ``i`` holds
            the scores of base learner ``i``.
        P_ensemble: Scores of the ensemble.
        labels: Legend labels, indexed in step with the columns of
            ``P_base_learners``.
        ens_label: Legend label for the ensemble curve.
    """
    plt.figure(figsize=(10, 8))
    # Dashed diagonal as the reference line.
    plt.plot([0, 1], [0, 1], 'k--')

    n_learners = P_base_learners.shape[1]
    # One colour per curve; index 0 is used for the ensemble.
    palette = [plt.cm.rainbow(pos)
               for pos in np.linspace(0, 1.0, n_learners + 1)]

    def draw_curve(scores, label, colour):
        # Compute and plot a single ROC curve.
        false_pos, true_pos, _ = roc_curve(ytest, scores)
        plt.plot(false_pos, true_pos, label=label, c=colour)

    # Curves of the individual models.
    for col in range(n_learners):
        draw_curve(P_base_learners[:, col], labels[col], palette[col + 1])

    # Curve of the ensemble.
    draw_curve(P_ensemble, ens_label, palette[0])

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(frameon=False)
    plt.show()


# Plot every model's ROC curve next to that of the row-wise mean of all predictions.
plot_roc_curve(ytest, P.values, P.mean(axis=1), list(P.columns), "ensemble")

 

  • 7
    点赞
  • 48
    收藏
    觉得还不错? 一键收藏
  • 72
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 72
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值