学习了很多博客文章和书本内容,对评价模型的方法有了一点理解:AUC对样本类别是否均衡不敏感,可以用来评判模型的稳定性。只要模型预测出了结果(得分),就能够绘制ROC曲线,不需要事先选定分类阈值;阈值通常取ROC曲线上最接近(0, 1)的点。自己编码绘制了一遍PRC和ROC,并计算了AUC,相当于粗糙地实现了sklearn里面的对应方法。
# -*- coding:utf-8 -*-
from __future__ import division
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
# Both CSV files are read without a header row; their two columns are named
# 'uid' and 'isbad', and 'uid' becomes the index.
col_name = ['uid', 'isbad']
stand = pd.read_csv('standard.csv', header=None, names=col_name,
                    index_col='uid')
answer = pd.read_csv('answer.csv', header=None, names=col_name,
                     index_col='uid')
# Inner join on the uid index. The shared 'isbad' column is disambiguated
# as 'isbad_x' (from standard.csv) and 'isbad_y' (from answer.csv).
total = stand.merge(answer, how='inner', left_index=True, right_index=True,
                    suffixes=('_x', '_y'))
def calculate_f1(data_set, actual, predict):
    """
    Compute the F1 score by hand.

    A row counts as positive when its value equals 1. Counting is done
    row by row with boolean masks, so it does not depend on the index
    labels being unique.

    :param data_set: DataFrame holding the ground-truth and predicted labels
    :param actual: name of the ground-truth column
    :param predict: name of the predicted-label column
    :return: F1 as a float; 0.0 when there is no true positive (which also
             covers "no actual positives" and "no predicted positives")
    """
    is_positive = data_set[actual] == 1
    is_pre_positive = data_set[predict] == 1
    true_positive = int((is_positive & is_pre_positive).sum())
    if true_positive == 0:
        # Precision, recall, or their sum would have a zero denominator
        # here; report 0.0 instead of raising ZeroDivisionError.
        return 0.0
    # float() keeps this a true division even without the __future__ import.
    precision = true_positive / float(is_pre_positive.sum())
    recall = true_positive / float(is_positive.sum())
    return 2 * (precision * recall) / (precision + recall)
# Hand-rolled F1: 'isbad_x' is the ground truth, 'isbad_y' the prediction.
calculate_f1(total, actual='isbad_x', predict='isbad_y')
# Same metric computed by scikit-learn, for comparison.
f1_score(total['isbad_x'], total['isbad_y'])
def calculate_pr(data_set, actual, predict):
    """
    Compute the points of a precision-recall curve by hand.

    The rows are sorted ascending by the prediction column. For every cut
    position i, rows i..n-1 are treated as predicted positive and
    precision, recall and F1 are computed against the ground truth
    (a row is an actual positive when its value equals 1). The cut with no
    predicted positives is skipped because precision is undefined there.

    NOTE: data_set is sorted IN PLACE by ``predict``; callers may rely on
    the resulting row order.

    :param data_set: DataFrame holding the ground-truth and predicted values
    :param actual: name of the ground-truth column
    :param predict: name of the prediction column used for ranking
    :return: tuple (precision list, recall list, F1 list), one entry per
             cut; three empty lists when there are no actual positives
    """
    data_set.sort_values(by=predict, inplace=True)
    is_positive = (data_set[actual] == 1).values.astype(int)
    sample_cnt = len(is_positive)
    positive_cnt = int(is_positive.sum())

    precision_list = []
    recall_list = []
    f1_list = []
    if positive_cnt == 0:
        # Recall is undefined without actual positives: no points to emit.
        return precision_list, recall_list, f1_list

    # tp_from[i] = number of actual positives among rows i..n-1, i.e. the
    # true positives when the cut is placed at position i.
    tp_from = is_positive[::-1].cumsum()[::-1]
    for i in range(sample_cnt):
        true_positive = int(tp_from[i])
        precision = true_positive / float(sample_cnt - i)
        recall = true_positive / float(positive_cnt)
        if true_positive == 0:
            # precision + recall == 0, so the harmonic mean is taken as 0.
            f1 = 0.0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
    return precision_list, recall_list, f1_list
# Hand-rolled precision / recall / F1 values, with 'isbad_x' as the ground
# truth column and 'isbad_y' as the prediction column.
p, r, f1 = calculate_pr(total, 'isbad_x', 'isbad_y')
# Precision-recall curve: recall on the x axis, precision on the y axis.
plt.plot(r, p)
plt.plot(r, f1) # F1 curve, drawn against recall on the same axes
plt.ylabel('precision')
plt.xlabel('recall')
plt.title('PRC')
def calculate_fpr_tpr(data_set, actual, predict):
    """
    Compute the points of a ROC curve by hand.

    The rows are sorted ascending by the prediction column. For every cut
    position i in 0..n, rows i..n-1 are treated as predicted positive and
    the true-positive and false-positive rates are computed. A row is an
    actual positive when its value equals 1 and an actual negative when it
    equals 0.

    NOTE: data_set is sorted IN PLACE by ``predict``; callers may rely on
    the resulting row order.

    :param data_set: DataFrame holding the ground-truth and predicted values
    :param actual: name of the ground-truth column
    :param predict: name of the prediction column used for ranking
    :return: tuple (fpr list, tpr list), each with n + 1 entries running
             from the "everything positive" cut to the "nothing positive" cut
    :raises ZeroDivisionError: when data_set has no actual positive or no
             actual negative row, since the rates are undefined then
    """
    data_set.sort_values(by=predict, inplace=True)
    labels = data_set[actual].values
    is_positive = (labels == 1).astype(int)
    is_negative = (labels == 0).astype(int)
    positive_cnt = int(is_positive.sum())
    negative_cnt = int(is_negative.sum())
    if positive_cnt == 0 or negative_cnt == 0:
        # Same exception type the plain division would raise, but with an
        # explanation attached.
        raise ZeroDivisionError(
            'ROC is undefined: data_set needs at least one positive '
            'and one negative sample')

    # Suffix counts: entry i is the number of positives (negatives) among
    # rows i..n-1. The trailing 0 is the cut where nothing is predicted
    # positive.
    tp_from = np.append(is_positive[::-1].cumsum()[::-1], 0)
    fp_from = np.append(is_negative[::-1].cumsum()[::-1], 0)
    tpr_list = (tp_from / float(positive_cnt)).tolist()
    fpr_list = (fp_from / float(negative_cnt)).tolist()
    return fpr_list, tpr_list
# Hand-rolled ROC points (false-positive rate, true-positive rate), with
# 'isbad_x' as the ground truth column and 'isbad_y' as the prediction column.
# NOTE: the call also leaves `total` sorted in place by 'isbad_y'.
fpr, tpr = calculate_fpr_tpr(total, 'isbad_x', 'isbad_y')
# NOTE(review): no new figure is created here, so when run as a plain script
# this draws onto whatever axes are current - confirm that is intended.
plt.plot(fpr, tpr)
plt.ylabel('tpr')
plt.xlabel('fpr')
plt.title('roc')
# Shade the region under the curve (its area is the AUC) and label it.
plt.fill_between(fpr, tpr, alpha=0.3)
plt.text(0.5, 0.5, 'AUC')
# ROC curve via scikit-learn.
# The scores passed in are evenly spaced values assigned by row position,
# not the model outputs, so this depends on the current row order of
# `total` (sorted in place by 'isbad_y' by the earlier hand-rolled calls).
# This rebinds fpr / tpr, replacing the hand-rolled values.
fpr, tpr, thresholds = roc_curve(total.isbad_x.values, np.linspace(0, 1, total.shape[0]))
plt.plot(fpr, tpr)
# AUC via scikit-learn; either of the two functions can be used
auc_score = auc(fpr, tpr)
auc_score1 = roc_auc_score(total.isbad_x.values, np.linspace(0, 1, total.shape[0]))