"""scikit-learn: Machine Learning in Python — a simple worked example.

Contents:
- data loading
- train/test splitting
- data preprocessing
- model building and evaluation
- cross-validated hyper-parameter search, feature selection, model selection
- saving and loading models
"""
# sklearn
from time import time
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict
from sklearn.model_selection import GridSearchCV
import pickle
from sklearn.externals import joblib
from sklearn import tree
from sklearn import svm
import matplotlib.pyplot as plt
# Shared RNG so every randomized step in this demo is reproducible.
random_state = np.random.RandomState(0)
def clf_dt():
    """Grid-search a decision tree on the Hastie-10-2 dataset.

    Runs a 5-fold cross-validated grid search over tree depth, leaf size
    and split size, scored with both AUC and accuracy (refit on the best
    AUC), then dumps the full ``cv_results_`` table to ``cv_result.csv``.

    Returns:
        int: 0 on completion (kept for backward compatibility).
    """
    # Load a synthetic binary classification dataset (labels in {-1, +1}).
    X, y = datasets.make_hastie_10_2(n_samples=8000, random_state=42)
    # The scorers can be either one of the predefined metric strings or a
    # scorer callable, like the one returned by make_scorer.
    scoring = {'AUC': 'roc_auc', 'Accuracy': metrics.make_scorer(metrics.accuracy_score)}
    # Setting refit='AUC' refits an estimator on the whole dataset with the
    # parameter setting that has the best cross-validated AUC score. That
    # estimator is made available at ``gs.best_estimator_`` along with
    # parameters like ``gs.best_score_``, ``gs.best_params_`` and
    # ``gs.best_index_``.
    tuned_parameters = {
        'max_depth': [2, 5, 10],
        'min_samples_leaf': [3, 6, 9],
        'min_samples_split': range(2, 403, 10),
    }
    clf = GridSearchCV(tree.DecisionTreeClassifier(random_state=random_state),
                       param_grid=tuned_parameters,
                       scoring=scoring, cv=5, refit='AUC', return_train_score=True)
    clf.fit(X, y)
    # Persist every parameter combination's scores for offline inspection.
    # ``to_csv`` accepts a path directly — no need to manage a file handle.
    cv_result = pd.DataFrame.from_dict(clf.cv_results_)
    cv_result.to_csv('cv_result.csv')
    return 0
def clf_svm():
    """End-to-end SVM demo: split, scale, fit, persist, and tune.

    Trains a linear SVC on standardized data, plots its ROC curve,
    round-trips the model through pickle and joblib, then demonstrates
    cross-validated scoring and grid-search hyper-parameter tuning.
    """
    # Load data.
    X, y = load_data()
    # Hold out 30% of the samples for testing.
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
    # Standardize features using statistics from the training split only,
    # so no information leaks from the test split.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_transformed = scaler.transform(x_train)
    x_test_transformed = scaler.transform(x_test)
    # Build a model with chosen parameters and evaluate it.
    clf = svm.SVC(kernel='linear', C=0.8, probability=True, random_state=random_state)
    clf = clf.fit(x_train_transformed, y_train)
    score = clf.score(x_test_transformed, y_test)
    plot_roc(clf, x_test_transformed, y_test)
    # Persist the model: in-memory pickle round-trip ...
    clf_pk = pickle.dumps(clf)
    clf_r = pickle.loads(clf_pk)
    # ... pickle to disk ...
    with open("clf.pickle", 'wb') as fw:
        pickle.dump(clf, fw)
    with open("clf.pickle", 'rb') as fr:
        clf_r = pickle.load(fr)
    # ... and joblib.
    # NOTE(review): ``sklearn.externals.joblib`` was removed in scikit-learn
    # 0.23 — the top-level import should become ``import joblib``; confirm
    # the pinned sklearn version.
    joblib.dump(clf, 'clf.joblib')  # save
    clf_r = joblib.load('clf.joblib')  # load
    # Cross-validated scoring, hyper-parameter estimation, model selection.
    # FIX: random_state only takes effect when shuffle=True; recent
    # scikit-learn raises ValueError if random_state is set without it.
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
    clf = svm.SVC(kernel='linear', C=1)
    # Single evaluation metric.
    scores = cross_val_score(clf, x_train_transformed, y_train, cv=cv, n_jobs=-1, scoring="f1_macro")
    # Several evaluation metrics at once.
    scoring = ['precision_macro', 'recall_macro']
    scores = cross_validate(clf, x_train_transformed, y_train, cv=cv, n_jobs=-1, scoring=scoring)
    # Evaluate out-of-fold predictions.
    predicted = cross_val_predict(clf, x_train_transformed, y_train, cv=cv)
    score = metrics.accuracy_score(y_train, predicted)
    # Manual cross-validated sweep over the regularization strength C.
    for c in np.linspace(0.05, 1, 10):
        clf = svm.SVC(kernel='linear', C=c)
        scores = cross_val_score(clf, x_train_transformed, y_train, cv=cv, n_jobs=-1, scoring="f1_macro")
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    # Tuning the hyper-parameters of an estimator with GridSearchCV.
    tuned_parameters = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]
    scores = ['precision', 'recall']
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        clf = GridSearchCV(svm.SVC(), tuned_parameters, cv=5, scoring='%s_macro' % score)
        # NOTE(review): this fit/predict uses the *unscaled* splits, unlike
        # the model above — confirm whether the scaled data was intended.
        clf.fit(x_train, y_train)
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        y_true, y_pred = y_test, clf.predict(x_test)
        print(metrics.classification_report(y_true, y_pred))
    return
def load_data():
    """Generate a noisy 2-feature binary classification dataset.

    Returns:
        tuple: ``(X, y)`` — a (200, 2) feature matrix and binary labels.
    """
    # Draw a synthetic classification problem with two informative features.
    features, labels = datasets.make_classification(
        n_samples=200,
        n_features=2,
        n_redundant=0,
        n_informative=2,
        n_classes=2,
        n_clusters_per_class=2,
        random_state=random_state,
    )
    # Perturb the features with uniform noise so the classes overlap a bit.
    features += 2 * random_state.uniform(size=features.shape)
    return features, labels
def plot_roc(clf, x_test, y_test):
    """Plot the ROC curve for ``clf`` evaluated on the given test split.

    Uses the predicted probability of the positive class (column 1 of
    ``predict_proba``) as the ranking score; blocks on ``plt.show()``.
    """
    # Rank samples by the positive-class probability.
    proba = clf.predict_proba(x_test)
    # Compute the ROC curve and the area under it.
    fpr, tpr, _ = metrics.roc_curve(y_test, proba[:, 1])
    roc_auc = metrics.auc(fpr, tpr)
    line_width = 2
    plt.figure(figsize=(10, 10))
    plt.plot(fpr, tpr, color='darkorange', lw=line_width,
             label='ROC curve (AUC = %0.2f)' % roc_auc)
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
if __name__ == "__main__":
    # Time the demo end-to-end and report the elapsed seconds.
    t0 = time()
    clf_dt()
    # clf_svm()
    print(time() - t0)
    print("END data mining")