scikit-learn example

Machine Learning in Python

A simple example

Contents [1]:

  1. Loading data
  2. Splitting data
  3. Preprocessing data
  4. Building and evaluating models
  5. Cross-validated hyperparameter tuning, feature selection, and model selection
  6. Saving and loading models
# sklearn example script
from time import time
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict
from sklearn.model_selection import GridSearchCV

import pickle
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23+

from sklearn import tree
from sklearn import svm

import matplotlib.pyplot as plt

random_state = np.random.RandomState(seed=0)


def clf_dt():
    # Load data
    X, y = datasets.make_hastie_10_2(n_samples=8000, random_state=42)

    # The scorers can either be one of the predefined metric strings or a
    # scorer callable, like the one returned by make_scorer
    scoring = {'AUC': 'roc_auc', 'Accuracy': metrics.make_scorer(metrics.accuracy_score)}

    # Setting refit='AUC' refits an estimator on the whole dataset with the
    # parameter setting that has the best cross-validated AUC score.
    # That estimator is made available at ``gs.best_estimator_`` along with
    # parameters like ``gs.best_score_``, ``gs.best_params_`` and
    # ``gs.best_index_``
    tuned_parameters = {'max_depth': [2, 5, 10], 'min_samples_leaf': [3, 6, 9], 'min_samples_split': range(2, 403, 10)}
    clf = GridSearchCV(tree.DecisionTreeClassifier(random_state=random_state),
                       param_grid=tuned_parameters,
                       scoring=scoring, cv=5, refit='AUC', return_train_score=True)
    clf.fit(X, y)
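    # A minimal sketch of the attributes mentioned above: after fitting with
    # refit='AUC', GridSearchCV exposes best_score_, best_params_ and
    # best_estimator_ (the values printed here depend on the run).
    print("Best AUC: %0.3f with params %r" % (clf.best_score_, clf.best_params_))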
    results = clf.cv_results_
    cv_result = pd.DataFrame.from_dict(results)
    with open('cv_result.csv', 'w') as f:
        cv_result.to_csv(f)
    return 0


def clf_svm():
    # Load data
    X, y = load_data()
    # Split data
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
    # Standardize the data (fit the scaler on the training set only)
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_transformed = scaler.transform(x_train)
    x_test_transformed = scaler.transform(x_test)

    # Build the model with the chosen parameters
    clf = svm.SVC(kernel='linear', C=0.8, probability=True, random_state=random_state)
    clf = clf.fit(x_train_transformed, y_train)
    score = clf.score(x_test_transformed, y_test)

    plot_roc(clf, x_test_transformed, y_test)

    # Save and reload the model
    clf_pk = pickle.dumps(clf)
    clf_r = pickle.loads(clf_pk)
    with open("clf.pickle", 'wb') as fw:
        pickle.dump(clf, fw)
    with open("clf.pickle", 'rb') as fr:
        clf_r = pickle.load(fr)
    joblib.dump(clf, 'clf.joblib')  # save
    clf_r = joblib.load('clf.joblib')  # load
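    # A small sanity-check addition: a model restored from pickle/joblib should
    # behave identically to the original, so it reproduces the same test score.
    assert clf_r.score(x_test_transformed, y_test) == score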


    # Cross-validated hyperparameter estimation, feature selection, and model selection
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
    clf = svm.SVC(kernel='linear', C=1)
    # Single scoring metric
    scores = cross_val_score(clf, x_train_transformed, y_train, cv=cv, n_jobs=-1, scoring="f1_macro")
    # Multiple scoring metrics
    scoring = ['precision_macro', 'recall_macro']
    scores = cross_validate(clf, x_train_transformed, y_train, cv=cv, n_jobs=-1, scoring=scoring)
    # Evaluate cross-validated predictions
    predicted = cross_val_predict(clf, x_train_transformed, y_train, cv=cv)
    score = metrics.accuracy_score(y_train, predicted)
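    # An illustrative addition: the same out-of-fold predictions can feed any
    # sklearn metric, e.g. a confusion matrix.
    print(metrics.confusion_matrix(y_train, predicted))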

    # Tuning the model's hyperparameters
    # Cross-validated hyperparameter estimation
    for c in np.linspace(0.05, 1, 10):
        clf = svm.SVC(kernel='linear', C=c)
        # cross validated finding parameters
        scores = cross_val_score(clf, x_train_transformed, y_train, cv=cv, n_jobs=-1, scoring="f1_macro")
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    # Tuning the hyper-parameters of an estimator
    tuned_parameters = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]
    scores = ['precision', 'recall']
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        clf = GridSearchCV(svm.SVC(), tuned_parameters, cv=5, scoring='%s_macro' % score)
        clf.fit(x_train_transformed, y_train)
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        y_true, y_pred = y_test, clf.predict(x_test_transformed)
        print(metrics.classification_report(y_true, y_pred))
    return


def load_data():
    # Load data
    # iris = datasets.load_iris()
    # X = iris.data
    # y = iris.target
    # unique_labels = set(y)
    # y = preprocessing.label_binarize(y, classes=list(unique_labels))

    # Generate classification data
    X, y = datasets.make_classification(n_samples=200, n_features=2, n_redundant=0, n_informative=2,
                                        n_classes=2, n_clusters_per_class=2, random_state=random_state)
    # Add noise
    X += 2 * random_state.uniform(size=X.shape)
    # unique_labels = set(y)
    # colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    # for k, col in zip(unique_labels, colors):
    #     x_k = X[y == k]
    #     plt.plot(x_k[:, 0], x_k[:, 1], 'o', markerfacecolor=col, markeredgecolor="k",
    #              markersize=12)
    # plt.title('data by make_classification()')
    # plt.show()

    return X, y


def plot_roc(clf, x_test, y_test):
    # Class labels
    # y_pred = clf.predict(x_test)
    # Class probabilities
    y_pred = clf.predict_proba(x_test)
    # Signed distances from the decision boundary
    # y_pred = clf.decision_function(x_test)
    # Compute ROC curve and ROC area for each class
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred[:, 1])
    roc_auc = metrics.auc(fpr, tpr)

    # n_classes = y_test.shape[1]
    # fpr = dict()
    # tpr = dict()
    # roc_auc = dict()
    # for i in range(n_classes):
    #     fpr[i], tpr[i], _ = metrics.roc_curve(y_test[:, i], y_pred[:, i])
    #     roc_auc[i] = metrics.auc(fpr[i], tpr[i])
    #
    # # Compute micro-average ROC curve and ROC area
    # fpr["micro"], tpr["micro"], _ = metrics.roc_curve(y_test.ravel(), y_pred.ravel())
    # roc_auc["micro"] = metrics.auc(fpr["micro"], tpr["micro"])
    #
    # fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred)
    # roc_auc = metrics.auc(fpr, tpr)

    lw = 2
    plt.figure(figsize=(10, 10))
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()


if __name__ == "__main__":
    start = time()
    clf_dt()
    # clf_svm()
    print(time() - start)
    print("END data mining")


  1. scikit-learn.org
