Python中使用集成模型(随机森林分类器、梯度提升决策树)进行性能分析与可视化

# Load the Titanic data set and build the pclass/age/sex feature matrix.
import pandas as pd

# The original Vanderbilt URL is unreachable (see the URLError log at the
# bottom of this post); read the local copy of the data instead.
titanic = pd.read_csv('../Datasets/Breast-Cancer/titanic.txt')
#titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

# Keep only the three predictive columns; copy() so the imputation below
# writes to our own frame instead of a view (avoids SettingWithCopyWarning).
X = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']
X.info()
# Impute missing ages with the column mean. Plain assignment replaces the
# original chained `fillna(..., inplace=True)`, which warns / is deprecated
# in modern pandas.
X['age'] = X['age'].fillna(X['age'].mean())
X.info()

# sklearn.cross_validation was removed in scikit-learn 0.20; the split
# helpers now live in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

from sklearn.feature_extraction import DictVectorizer
# One-hot encode the categorical columns. The orient keyword is 'records';
# the original misspelling 'record' is rejected by modern pandas.
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
print(vec.feature_names_)
X_test = vec.transform(X_test.to_dict(orient='records'))
# Train three tree-based classifiers on the vectorized features and keep
# each model's test-set predictions for the comparison below.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Baseline: a single decision tree.
dtc = DecisionTreeClassifier()
dtc_y_pred = dtc.fit(X_train, y_train).predict(X_test)

# Bagging ensemble: random forest.
rfc = RandomForestClassifier()
rfc_y_pred = rfc.fit(X_train, y_train).predict(X_test)

# Boosting ensemble: gradient boosted trees.
gbc = GradientBoostingClassifier()
gbc_y_pred = gbc.fit(X_train, y_train).predict(X_test)



from sklearn.metrics import classification_report

# NOTE: classification_report expects (y_true, y_pred) in that order. The
# original code passed the predictions first, which transposes the
# precision and recall columns in every printed report.
print('The accuracy of decision tree is', dtc.score(X_test, y_test))
print(classification_report(y_test, dtc_y_pred))

print('The accuracy of random decision tree is', rfc.score(X_test, y_test))
print(classification_report(y_test, rfc_y_pred))

print('The accuracy of gradient forest tree is', gbc.score(X_test, y_test))
print(classification_report(y_test, gbc_y_pred))



from matplotlib import pyplot as plt
import numpy as np 

def show_values(pc, fmt="%.2f", **kw):
    '''
    Annotate every cell of a pcolor heatmap with its numeric value.

    Parameters
    ----------
    pc : the QuadMesh/PolyCollection returned by ``ax.pcolor``
    fmt : printf-style format string for each cell value
    **kw : extra keyword arguments forwarded to ``ax.text``

    Source: https://stackoverflow.com/a/25074150/395857 (by HYRY)
    '''
    # The original rebound the module-global `zip` to itertools.izip as a
    # Python-2 shim; on Python 3 that shim is dead code and mutating the
    # builtin name leaks into the whole module, so it is removed here.
    pc.update_scalarmappable()
    ax = pc.axes
    for p, color, value in zip(pc.get_paths(), pc.get_facecolors(), pc.get_array()):
        # Cell center: mean of the polygon vertices (last two close the path).
        x, y = p.vertices[:-2, :].mean(0)
        # Black text on light cells, white text on dark cells.
        if np.all(color[:3] > 0.5):
            color = (0.0, 0.0, 0.0)
        else:
            color = (1.0, 1.0, 1.0)
        ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw)


def cm2inch(*tupl):
    '''
    Convert centimeter dimensions to inches for matplotlib figure sizing.

    Accepts either separate values ``cm2inch(w, h)`` or a single tuple
    ``cm2inch((w, h))`` and returns a tuple of the values divided by 2.54.

    Source: https://stackoverflow.com/a/22787457/395857 (by gns-ank)
    '''
    inch = 2.54
    # isinstance instead of the original `type(...) == tuple` comparison:
    # idiomatic, and also accepts tuple subclasses.
    if isinstance(tupl[0], tuple):
        return tuple(i / inch for i in tupl[0])
    return tuple(i / inch for i in tupl)


def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels, figure_width=40, figure_height=20, correct_orientation=False, cmap='RdBu'):
    '''
    Draw an annotated heatmap of the 2-D matrix ``AUC``.

    Parameters
    ----------
    AUC : 2-D numpy array of the values to plot
    title, xlabel, ylabel : figure text
    xticklabels, yticklabels : per-column / per-row labels
    figure_width, figure_height : figure size in centimeters
    correct_orientation : if True, put the origin at the top-left
    cmap : matplotlib colormap name

    Inspired by:
    - https://stackoverflow.com/a/16124677/395857
    - https://stackoverflow.com/a/25074150/395857
    '''
    # Plot it out
    fig, ax = plt.subplots()
    c = ax.pcolor(AUC, edgecolors='k', linestyle='dashed', linewidths=0.2, cmap=cmap)

    # Put the major ticks at the middle of each cell.
    ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)

    # Set tick labels.
    ax.set_xticklabels(xticklabels, minor=False)
    ax.set_yticklabels(yticklabels, minor=False)

    # Set title and x/y labels.
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    # Remove the last blank column.
    plt.xlim((0, AUC.shape[1]))

    # Hide all major tick marks (the labels stay visible). The original
    # toggled the `tick1On` / `tick2On` attributes, which were deprecated
    # and removed in Matplotlib 3.8; tick_params is the supported API.
    ax.tick_params(which='major', top=False, bottom=False, left=False, right=False)

    # Add color bar.
    plt.colorbar(c)

    # Add the value text in each cell.
    show_values(c)

    # Proper orientation (origin at the top left instead of bottom left).
    if correct_orientation:
        ax.invert_yaxis()
        ax.xaxis.tick_top()

    # Resize (figure size is given in centimeters).
    fig = plt.gcf()
    fig.set_size_inches(cm2inch(figure_width, figure_height))



def plot_classification_report(classification_report, title='Classification report ', cmap='RdBu'):
    '''
    Plot a scikit-learn classification report (its printed text form)
    as a precision/recall/F1 heatmap, one row per class.
    Extension based on https://stackoverflow.com/a/31689645/395857
    '''
    lines = classification_report.split('\n')

    plotMat = []
    support = []
    class_names = []
    for line in lines[2:]:
        t = line.strip().split()
        # A per-class row is exactly "<label> <precision> <recall> <f1> <support>".
        # Skipping everything else also drops the summary rows ("accuracy",
        # "macro avg", "weighted avg", "avg / total") that newer scikit-learn
        # report formats append, which used to corrupt/crash the matrix.
        if len(t) != 5:
            continue
        try:
            v = [float(x) for x in t[1:4]]
            sup = int(t[-1])
        except ValueError:
            continue  # non-numeric row that happened to have 5 tokens
        class_names.append(t[0])
        support.append(sup)
        print(v)
        plotMat.append(v)

    print('plotMat: {0}'.format(plotMat))
    print('support: {0}'.format(support))

    xlabel = 'Metrics'
    ylabel = 'Classes'
    xticklabels = ['Precision', 'Recall', 'F1-score']
    yticklabels = ['{0} ({1})'.format(class_names[idx], sup) for idx, sup in enumerate(support)]
    figure_width = 25
    # One extra row of height per class plus a fixed margin.
    figure_height = len(class_names) + 7
    correct_orientation = False
    heatmap(np.array(plotMat), title, xlabel, ylabel, xticklabels, yticklabels, figure_width, figure_height, correct_orientation, cmap=cmap)

#传入相应的report结果
# Render one classification-report heatmap PNG per trained model.
def main():
    '''Save a report heatmap for the decision tree, random forest and GBDT.'''
    # classification_report expects (y_true, y_pred); the original passed the
    # predictions first, transposing precision and recall in every figure.
    # The 'radom' typo in the forest filename is also corrected here.
    reports = [
        (dtc_y_pred, 'decision_tree_report.png'),
        (rfc_y_pred, 'random_forest_classifier_report.png'),
        (gbc_y_pred, 'gradient_tree_classifier_report.png'),
    ]
    for y_pred, filename in reports:
        plot_classification_report(classification_report(y_test, y_pred))
        plt.savefig(filename, dpi=200, format='png', bbox_inches='tight')
        plt.close()

# Script entry point: generate and save the three report heatmaps.
if __name__ == "__main__":
    main()
    #cProfile.run('main()') # if you want to do some profiling



输出结果如下:
 File "D:\Python35\lib\urllib\request.py", line 1256, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。>
修改数据连接文件:
titanic = pd.read_csv('../Datasets/Breast-Cancer/titanic.txt')

最后输出结果如下:

The accuracy of decision tree is 0.7811550151975684
             precision    recall  f1-score   support

          0       0.91      0.78      0.84       236
          1       0.58      0.80      0.67        93

avg / total       0.81      0.78      0.79       329

The accuracy of random decision tree is 0.7781155015197568
             precision    recall  f1-score   support

          0       0.90      0.78      0.83       233
          1       0.59      0.78      0.67        96

avg / total       0.81      0.78      0.79       329

The accuracy of gradient forest tree is 0.790273556231003
             precision    recall  f1-score   support

          0       0.92      0.78      0.84       239
          1       0.58      0.82      0.68        90

avg / total       0.83      0.79      0.80       329

可视化分析如下图所示:


                
  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
好的,那我可以为您提供一些使用Python、Tensorflow、sklearn、numpy、pytorch和matplotlib等工具库进行机器学习项目开发时所需的算法实现示例和建议。 1. 最小二乘法:最小二乘法是一种常见的线性回归算法,可以用来拟合数据集并预测新的数据点。在Python,可以使用NumPy库实现最小二乘法。例如: ```python import numpy as np # 构造数据集 x = np.arange(0, 10, 0.1) y = 2 * x + 1 + np.random.randn(len(x)) # 最小二乘法拟合 A = np.vstack([x, np.ones(len(x))]).T w = np.linalg.lstsq(A, y, rcond=None)[0] # 输出拟合结果 print('斜率:', w[0]) print('截距:', w[1]) ``` 2. 梯度下降法:梯度下降法是一种常见的优化算法,可以用于训练神经网络等模型的参数。在Python,可以使用TensorFlow库实现梯度下降法。例如: ```python import tensorflow as tf import numpy as np # 构造数据集 x = np.random.randn(100) y = 3 * x + 2 + np.random.randn(100) * 0.5 # 定义模型参数 w = tf.Variable(0.0) b = tf.Variable(0.0) # 定义损失函数和优化器 loss = tf.reduce_mean(tf.square(y - w * x - b)) optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss) # 训练模型 with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for i in range(100): sess.run(optimizer) if i % 10 == 0: print('第 %d 次训练,损失值为:%.4f,斜率为:%.4f,截距为:%.4f' % (i, sess.run(loss), sess.run(w), sess.run(b))) ``` 3. 分类:分类是机器学习的一个重要任务,可以用于识别图像、语音等数据。在Python,可以使用Scikit-learn库实现分类算法。例如: ```python from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score # 加载数据集 iris = load_iris() # 划分训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=0) # 定义模型并训练 clf = LogisticRegression() clf.fit(X_train, y_train) # 预测并评估模型 y_pred = clf.predict(X_test) print('准确率为:%.4f' % accuracy_score(y_test, y_pred)) ``` 4. 
聚类:聚类是一种无监督学习算法,可以将数据集的样本分为若干个类别。在Python,可以使用Scikit-learn库实现聚类算法。例如: ```python from sklearn.datasets import make_blobs from sklearn.cluster import KMeans import matplotlib.pyplot as plt # 生成数据集 X, y = make_blobs(n_samples=100, centers=3, random_state=0) # 定义模型并训练 clf = KMeans(n_clusters=3) clf.fit(X) # 可视化结果 plt.scatter(X[:, 0], X[:, 1], c=clf.labels_) plt.scatter(clf.cluster_centers_[:, 0], clf.cluster_centers_[:, 1], marker='x', color='r') plt.show() ``` 5. 支持向量机:支持向量机是一种常见的分类算法,可以用于处理线性和非线性分类问题。在Python,可以使用Scikit-learn库实现支持向量机算法。例如: ```python from sklearn.datasets import make_classification from sklearn.svm import SVC from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score # 生成样本数据 X, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, random_state=0) # 划分训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # 定义模型并训练 clf = SVC(kernel='rbf') clf.fit(X_train, y_train) # 预测并评估模型 y_pred = clf.predict(X_test) print('准确率为:%.4f' % accuracy_score(y_test, y_pred)) ``` 6. 决策树决策树是一种常见的分类和回归算法,可以用于处理离散型和连续型的数据。在Python,可以使用Scikit-learn库实现决策树算法。例如: ```python from sklearn.datasets import load_iris from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score # 加载数据集 iris = load_iris() # 划分训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=0) # 定义模型并训练 clf = DecisionTreeClassifier() clf.fit(X_train, y_train) # 预测并评估模型 y_pred = clf.predict(X_test) print('准确率为:%.4f' % accuracy_score(y_test, y_pred)) ``` 7. 
神经网络:神经网络是一种常见的深度学习算法,可以用于处理图像、语音等复杂的数据。在Python,可以使用TensorFlow或PyTorch库实现神经网络算法。例如: ```python import tensorflow as tf import numpy as np # 构造数据集 X = np.random.randn(100, 10) y = np.random.randint(0, 2, size=(100, 1)) # 定义模型参数 inputs = tf.keras.layers.Input(shape=(10,)) x = tf.keras.layers.Dense(64, activation='relu')(inputs) x = tf.keras.layers.Dense(32, activation='relu')(x) outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x) # 定义模型并训练 model = tf.keras.models.Model(inputs=inputs, outputs=outputs) model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) model.fit(X, y, epochs=10, batch_size=32) # 预测并评估模型 X_test = np.random.randn(10, 10) y_pred = model.predict(X_test) print(y_pred) ``` 以上是一些使用Python、Tensorflow、sklearn、numpy、pytorch和matplotlib等工具库进行机器学习项目开发时所需的算法实现示例和建议,希望对您有所帮助。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值