1. sklearn.metrics.make_scorer
1.1 参数解释
'''
score_func
记分函数 如sklearn.metrics.accuracy_score
损失函数 如sklearn.metrics.mean_squared_error
greater_is_better
True(默认) 直译为值越大越好,score_func是记分函数时取True
False score_func为损失函数时值越小拟合情况越好,得到的对象会对结果进行翻转
needs_proba
True score_func 接收概率(即 predict_proba 的输出)作为预测值
False(默认) score_func 接收 predict 的输出
needs_threshold
True 仅用于二分类,score_func 接收连续的决策值(decision_function 或 predict_proba 的输出)
False(默认) score_func 接收离散的预测标签
'''
sklearn.metrics.make_scorer(score_func, greater_is_better=True, needs_proba=False, needs_threshold=False)
1.2 用于随机/网格搜索 sklearn.model_selection.RandomizedSearchCV/GridSearchCV(scoring=make_scorer(*))
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np
import pandas as pd
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error (SMAPE), in percent.

    SMAPE = 100/n * sum( |y_true - y_pred| / ((|y_true| + |y_pred|) / 2) )

    Pairs where both y_true and y_pred are 0 contribute 0 instead of
    dividing 0/0. Accepts any array-like input (list, tuple, ndarray) —
    inputs are coerced to float ndarrays so plain Python lists work too.

    Returns a float in [0, 200]; 0 means a perfect fit (lower is better).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    smap = np.zeros(len(y_true))
    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Only divide where at least one of the pair is nonzero (dem > 0);
    # the remaining positions stay 0 by construction.
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]
    return 100 * np.mean(smap)
# NOTE(review): this snippet assumes the hyper-parameter value lists
# (n_estimators, max_features, max_depth, min_samples_split,
# min_samples_leaf, bootstrap) are defined beforehand, and that
# RandomForestRegressor / KFold are imported from sklearn.ensemble /
# sklearn.model_selection.
rfc = RandomForestRegressor()
forest_params = [{'n_estimators': n_estimators,
                  'max_features': max_features,
                  'max_depth': max_depth,
                  'min_samples_split': min_samples_split,
                  'min_samples_leaf': min_samples_leaf,
                  'bootstrap': bootstrap}]
cv = KFold(n_splits=10, shuffle=True, random_state=42)
# SMAPE is an error metric (smaller is better), so greater_is_better=False
# is required: make_scorer then negates the score, and the search's
# maximization picks the parameters with the lowest SMAPE.
# verbose must be a non-negative int in scikit-learn; 0 = quiet.
clf = RandomizedSearchCV(rfc, forest_params, cv=cv,
                         scoring=make_scorer(smape, greater_is_better=False),
                         verbose=0)
1.3 用于交叉验证 sklearn.model_selection.cross_validate(scoring=make_scorer(*))
from sklearn.metrics import make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
liner_model = LinearRegression()
# Map result names to scorer callables: each key k appears in the
# cross_validate output dict as 'test_<k>'.
# NOTE(review): my_scorer must be defined beforehand, e.g.
#   my_scorer = make_scorer(smape, greater_is_better=False)
scoring = {
    'customize_score': my_scorer
}
# shuffle=True is required whenever random_state is set; since
# scikit-learn 0.24, KFold(random_state=0) without shuffle raises a
# ValueError because the seed would have no effect.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
cv_cross = cross_validate(liner_model, x_train_std, y_train, cv=kfold, scoring=scoring)
print(cv_cross['test_customize_score'].mean())
print(cv_cross['test_customize_score'].std())
2. sklearn.metrics.f1_score/recall_score/precision_score
pre = model.predict(X_test, batch_size=batch_size)
pre, y_test
'''
array([[0.18804531, 0.3357192 , 0.47623548],
[0.30215347, 0.36785322, 0.32999334],
[0.18804531, 0.3357192 , 0.47623548],
[0.18804531, 0.3357192 , 0.47623548],
[0.18804531, 0.3357192 , 0.47623548],
[0.30215347, 0.36785322, 0.32999334],
[0.30215347, 0.36785322, 0.32999334],
[0.18804531, 0.3357192 , 0.47623548],
[0.30215347, 0.36785322, 0.32999334]])
array([[1., 0., 0.],
[1., 0., 0.],
[0., 1., 0.],
[1., 0., 0.],
[0., 1., 0.],
[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.],
[1., 0., 0.],
[0., 1., 0.]])
'''
# In-place one-hot conversion of the probability matrix `pre`: the
# largest value in each row becomes 1, everything else 0. If a row has
# tied maxima, every tied position is set to 1 (same element-wise
# comparison behavior as before).
for row in range(len(pre)):
    peak = max(pre[row])
    for col in range(len(pre[row])):
        pre[row][col] = 1 if pre[row][col] == peak else 0
from sklearn.metrics import f1_score, recall_score, precision_score
f1 = f1_score(y_true=y_test, y_pred=pre, average='weighted')
recall = recall_score(y_true=y_test, y_pred=pre, average='weighted')
precision = precision_score(y_true=y_test, y_pred=pre, average='weighted')
print(" f1: %f — precision: %f — recall: %f" % (f1, precision, recall))
3. sklearn.metrics.confusion_matrix
from sklearn.metrics import confusion_matrix
y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]
'''
输出混淆矩阵
[i, j] 表示真实标签是i但预测标签是j的样本个数
normalize
None 默认,输出的每个元素是个数
'pred' 输出比例,每个元素除以所在列之和(按预测标签归一化)
'true' 输出比例,每个元素除以所在行之和(按真实标签归一化)
'all' 输出概率,每个元素除以矩阵总和
'''
confusion_matrix(y_true, y_pred, normalize=None)
'''
array([[2, 0, 0],
[0, 0, 1],
[1, 0, 2]])
'''
- 二分类 真阴性(tn)、假阳性(fp)、假阴性(fn)和真阳性(tp)
y_true = [0, 0, 0, 1, 1, 1, 1, 1]
y_pred = [0, 1, 0, 1, 0, 1, 0, 1]
'''
对于二分类
输出真阴性(tn)、假阳性(fp)、假阴性(fn)和真阳性(tp)的计数
'''
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(tn, fp, fn, tp)
4. sklearn.metrics.cohen_kappa_score
4.1 Kappa系数原理
'''
Kappa系数用于一致性检验,也可以用于衡量分类精度;
Kappa系数是一种比例,代表着分类与完全随机的分类产生错误减少的比例;
kappa系数的计算是基于混淆矩阵的。
K = (Po - Pe) / (1 - Pe)
Po即准确率(accuracy),每一类被正确分类的样本数量之和除以总样本数;
Pe = (a1*b1 + a2*b2 + ... + ac*bc) / (n*n)
其中每一类的真实样本个数分别为a1, a2, ..., ac,预测的每一类的样本个数分别为b1, b2, ..., bc,总样本数为n;
kappa计算结果为-1~1,但通常落在 0~1 间。
0.0~0.20 极低的一致性(slight);
0.21~0.40 一般的一致性(fair);
0.41~0.60 中等的一致性(moderate);
0.61~0.80 高度的一致性(substantial);
0.81~1.00 几乎完全一致(almost perfect)
'''
4.2 api使用
from sklearn.metrics import cohen_kappa_score
'''
weights
'linear' 线性加权
'quadratic' 二次加权
None 默认
'''
y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]
cohen_kappa_score(y_true, y_pred, weights='quadratic')
'''
0.5454545454545454
'''