Table of Contents
dataset
1. Combining transformers and estimators in a pipeline
pass
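The section body was left as `pass`; as a minimal sketch (assuming the X_train/y_train/X_test/y_test split from the dataset section above), transformers and a final estimator chain like this:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# transformers (fit/transform) run in order; the last step is the estimator (fit/predict)
pipe = make_pipeline(StandardScaler(),
                     PCA(n_components=2),
                     LogisticRegression(random_state=1))
pipe.fit(X_train, y_train)                      # fits each step in sequence
print('Test accuracy: %.3f' % pipe.score(X_test, y_test))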
2. Model Evaluation and Hyperparameter Tuning
Confusion matrix
Several important ratios
| | Predicted Positive | Predicted Negative |
|---|---|---|
| Actual Positive | TP | FN |
| Actual Negative | FP | TN |
- TPR, true positive rate: TPR = TP / (TP + FN)
  - Recall: when classes are imbalanced, a classifier that predicts everything as the majority class can score high accuracy without being useful, so recall is the metric to watch.
- FPR, false positive rate: FPR = FP / (FP + TN)
- Mnemonic: TPR and FPR are both computed across a row (actual class); precision (PRE) is computed down a column (predicted class).
- ROC curve
- Positive predictive value (PPV, i.e. precision): higher is better
PPV = TP / (TP + FP)
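A small numeric sketch of the row/column mnemonic above (the counts are made up):

# made-up counts laid out as in the table above
tp, fn = 40, 10        # actual-positive row
fp, tn = 5, 45         # actual-negative row
tpr = tp / (tp + fn)   # across the positive row (= recall): 0.8
fpr = fp / (fp + tn)   # across the negative row: 0.1
ppv = tp / (tp + fp)   # down the predicted-positive column: ~0.889
print(tpr, fpr, ppv)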
Code implementation
from sklearn.metrics import confusion_matrix
pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)  # rows: true labels (0, 1); columns: predicted labels (0, 1)
- Note: the label order can be customized:
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])
- Generate a report (precision, recall, f1, support) from the predictions:
  - macro avg: unweighted average over all classes
  - weighted avg: average weighted by support (accounts for different sample counts per class)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, digits=4))
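The same numbers are available metric by metric; a short sketch using sklearn's individual scorers on the y_pred from above:

from sklearn.metrics import precision_score, recall_score, f1_score

print('Precision: %.4f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall:    %.4f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1:        %.4f' % f1_score(y_true=y_test, y_pred=y_pred))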
ROC curve implementation
If the model has learned anything, its ROC curve should lie above the 45-degree diagonal, toward the upper left of the plot.
- Plotting intuition: with the threshold at its maximum, TPR and FPR both start at 0; as the threshold is lowered (moving right to left through the scores), TPR and FPR both rise.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from distutils.version import LooseVersion as Version
from scipy import __version__ as scipy_version
# scipy.interp was removed in newer SciPy; fall back to numpy.interp
if scipy_version >= Version('1.4.1'):
    from numpy import interp
else:
    from scipy import interp
pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression(penalty='l2',
                                           random_state=1,
                                           solver='lbfgs',
                                           C=100.0))
X_train2 = X_train[:, [4, 14]]  # keep only two features to make the task harder
cv = list(StratifiedKFold(n_splits=3).split(X_train, y_train))
# split the training data into 3 stratified folds; materialize the generator as a list
fig = plt.figure(figsize=(7, 5))
mean_tpr = 0.0                     # accumulator for the mean TPR curve, starting from 0
mean_fpr = np.linspace(0, 1, 100)  # 100 grid points; there are only ~56 thresholds, so interpolate
all_tpr = []
# loop over the three folds
for i, (train, test) in enumerate(cv):
    probas = pipe_lr.fit(X_train2[train],
                         y_train[train]).predict_proba(X_train2[test])
    fpr, tpr, thresholds = roc_curve(y_train[test],
                                     probas[:, 1],   # probability of the positive class
                                     pos_label=1)    # which class counts as positive
    mean_tpr += interp(mean_fpr, fpr, tpr)  # accumulate for the mean; linear interpolation (x must be increasing)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)                 # area under this fold's curve
    plt.plot(fpr,
             tpr,
             label='ROC fold %d (area = %0.2f)'
                   % (i+1, roc_auc))
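The loop above only accumulates mean_tpr; a sketch of the usual finishing steps, averaging over the folds and drawing the 45-degree reference line:

mean_tpr /= len(cv)   # average the accumulated TPR over the three folds
mean_tpr[-1] = 1.0    # force the mean curve to end at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc)
plt.plot([0, 1], [0, 1],
         linestyle=':', color='gray',
         label='Random guessing')   # the 45-degree line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()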
- thresholds: the decision cutoffs. The first threshold is set above the highest score (max + 1 in older scikit-learn, inf in newer versions) so that the curve starts at TPR = FPR = 0.
- Only two features are used here on purpose: with the full feature set the classifier learns too well and the ROC curve hugs the top-left corner.
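A toy check of the threshold behavior (the scores are made up):

import numpy as np
from sklearn.metrics import roc_curve

y_true = np.array([0, 0, 1, 1])
scores = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, thresholds = roc_curve(y_true, scores, pos_label=1)
print(thresholds)   # first entry is above max(scores), where TPR = FPR = 0
print(fpr, tpr)     # both start at 0, as described above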
Computing AUC in SQL
WITH r1 AS (  -- group by score and sort descending
SELECT score,
       count(1) FILTER (WHERE label IS TRUE) AS t,
       count(1) FILTER (WHERE label IS FALSE) AS f
FROM score_label
GROUP BY score
ORDER BY score desc)
,r2 AS (  -- running totals at or above each threshold
SELECT score, t, f,
       sum(t) OVER (ORDER BY score desc) AS tsum,  -- count of t at or above this score
       sum(f) OVER (ORDER BY score desc) AS fsum   -- count of f at or above this score
FROM r1)
,r3 AS (  -- add a (0, 0) origin row; compute TPR and FPR for each point
SELECT case when (SELECT sum(f) FROM r2) = 0 then 0
            else f / (SELECT sum(f) FROM r2) end AS width,  -- x-step of this score group
       case when (SELECT sum(t) FROM r2) = 0 then 0
            else tsum / (SELECT sum(t) FROM r2) end AS y,   -- TPR
       case when (SELECT sum(f) FROM r2) = 0 then 0
            else fsum / (SELECT sum(f) FROM r2) end AS x    -- FPR
FROM r2
UNION SELECT 0, 0, 0)
,r4 AS (  -- sort by FPR (the x axis)
SELECT *
FROM r3
ORDER BY x)
,r5 AS (  -- integrate y over x (trapezoid rule) to get the area
SELECT cast(x AS numeric(18, 3)) x,
       cast(y AS numeric(18, 3)) y,
       (y + lag(y, 1, 0.0) OVER (ORDER BY x, y)) * width / 2 AS area
FROM r4)
SELECT array_agg(x) x,  -- collapse into one row
       array_agg(y) y,
       sum(area) auc
FROM r5
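The SQL result can be cross-checked against sklearn on the same (score, label) pairs; a sketch with a made-up dump of the score_label table:

from sklearn.metrics import roc_auc_score

# made-up rows from score_label
labels = [True, False, True, True, False, False, True, False]
scores = [0.9, 0.8, 0.7, 0.6, 0.55, 0.5, 0.4, 0.3]
print(roc_auc_score(labels, scores))  # should match the SQL auc column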
3. Dealing with class imbalance
- sklearn.utils.class_weight.compute_class_weight (see the sketch after this list)
- the SMOTE method, implemented by the imbalanced-learn Python package: "a python package to tackle the curse of imbalanced datasets in Machine Learning"
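A sketch of both approaches on made-up data (not the breast-cancer arrays below):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE   # pip install imbalanced-learn

y = np.array([0] * 90 + [1] * 10)           # made-up 9:1 imbalance
X = np.random.RandomState(0).randn(100, 2)  # made-up features

# 'balanced' gives each class weight n_samples / (n_classes * count)
weights = compute_class_weight(class_weight='balanced',
                               classes=np.unique(y), y=y)
print(weights)              # the minority class gets the larger weight

# SMOTE synthesizes new minority samples between existing neighbors
X_res, y_res = SMOTE(random_state=42).fit_resample(X, y)
print(np.bincount(y_res))   # [90 90], now balanced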
Example with the breast cancer dataset:
# create an extremely imbalanced dataset
X_imb = np.vstack((X[y == 0], X[y == 1][:40]))  # 357 benign + 40 malignant => shape (397, 30)
y_imb = np.hstack((y[y == 0], y[y == 1][:40]))  # shape (397,); first 357 are 0, last 40 are 1
# idea: draw n samples with replacement (replace=True)
from sklearn.utils import resample
print('Number of class 1 examples before:', X_imb[y_imb == 1].shape[0])
X_upsampled, y_upsampled = resample(X_imb[y_imb == 1],
                                    y_imb[y_imb == 1],
                                    replace=True,
                                    n_samples=X_imb[y_imb == 0].shape[0],
                                    random_state=123)
print('Number of class 1 examples after:', X_upsampled.shape[0])
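Following the same vstack/hstack pattern, the upsampled minority class can be stacked back onto the majority class; a sketch of the balanced result:

X_bal = np.vstack((X[y == 0], X_upsampled))   # 357 + 357 rows
y_bal = np.hstack((y[y == 0], y_upsampled))
# a predict-everything-0 baseline now only reaches 50% accuracy
print('Share of class 0: %.1f%%' % (np.mean(y_bal == 0) * 100))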