Python实现F1 、AUC计算

# -*- coding: utf-8 -*- #
# author NEU_LightBulb-----ZhangJ
import numpy as np
import pandas as pd

class Score():
    def __init__(self,pre_score,rel_label,threshold,beta):
        self.tn = 0
        self.fn = 0
        self.fp = 0
        self.tp = 0
        self.pre_score = pre_score
        self.rel_label = rel_label
        self.threshold = threshold
        self.beta = beta
        list(map(self.__getCM_count,
                 self.pre_score,
                 self.rel_label))

    def __getCM(self,pre, rel):
        if (pre < self.threshold):
            if (rel == 0): return 'TN'
            if (rel == 1): return 'FN'
        if (pre >=  self.threshold):
            if (rel == 0): return 'FP'
            if (rel == 1): return 'TP'

    def get_cm(self):
        return list(map(self.__getCM,
                        self.pre_score,
                        self.rel_label))

    def __getCM_count(self,pre, rel):
        if (pre < self.threshold):
            if (rel == 0): self.tn += 1
            if (rel == 1): self.fn += 1
        if (pre >=  self.threshold):
            if (rel == 0): self.fp += 1
            if (rel == 1): self.tp += 1

    def get_f1(self):
        P = self.tp/(self.tp+self.fp)
        R = self.tp/(self.tp+self.fn)
        if(P == 0.0):
            return 0.0
        else:
            return (self.beta*self.beta+1)*P*R/(self.beta*self.beta*P+R)

    # 方法二 precision——分数精度
    def get_auc_by_count(self,precision=100):
        # 正样本数
        postive_len = sum(self.rel_label)
        # 负样本数
        negative_len = len(self.rel_label) - postive_len
        # 总对比数
        total_case = postive_len * negative_len
        # 正样本分数计数器(填0在range...)
        pos_histogram = [0 for _ in range(precision+1)]
        # 负样本分数计数器(填0在range...)
        neg_histogram = [0 for _ in range(precision+1)]
        # 分数放大
        bin_width = 1.0 / precision

        for i in range(len(self.rel_label)):
            nth_bin = int(self.pre_score[i] / bin_width)
            if self.rel_label[i] == 1:
                pos_histogram[nth_bin] += 1
            else:
                neg_histogram[nth_bin] += 1

        accumulated_neg = 0
        satisfied_pair = 0
        for i in range(precision+1):
            satisfied_pair += (pos_histogram[i] * accumulated_neg + pos_histogram[i] * neg_histogram[i] * 0.5)
            accumulated_neg += neg_histogram[i]
        return satisfied_pair / float(total_case)

    # 方法三
    def get_auc_by_rank(self):
        # 拼接排序
        df = pd.DataFrame({'pre_score':self.pre_score,'rel_label':self.rel_label})
        df = df.sort_values(by='pre_score',ascending=False).reset_index(drop=True)
        # 获取 n,N,M
        n = len(df)
        M = len(df[df['rel_label']==1])
        N = n - M
        # 初始化rank 和同值统计ank_tmp,count_all,count_p
        rank = 0.0
        rank_tmp,count_all,count_p = 0.0,0,0
        # 添加防止越界的一条不影响结果的记录
        df.loc[n] = [0,0]
        # 遍历一次
        for i in range(n):
            # 判断i+1是否与i同值,不同值则要考虑是否刚刚结束同值统计
            if(df['pre_score'][i+1] != df['pre_score'][i]):
                # 正样本
                if(df['rel_label'][i] == 1):
                    # 计数不为0,刚刚结束同值统计
                    if (count_all != 0):
                        # 同值统计结果加在rank上,这里注意补回结束统计时漏掉的最后一条同值数据
                        rank += (rank_tmp + n - i) * (count_p+1) / (count_all+1)
                        rank_tmp, count_all, count_p = 0.0, 0, 0
                        continue
                    rank += (n-i)
                else:
                    if (count_all != 0):
                        rank += (rank_tmp + n - i) * (count_p) / (count_all+1)
                        rank_tmp, count_all, count_p = 0.0, 0, 0
                        continue
            else:
                rank_tmp += (n-i)
                count_all += 1
                if(df['rel_label'][i] == 1):
                    count_p += 1
        return (rank-M*(1+M)/2)/(M*N)


if __name__ == '__main__':
     learn_data_L2 = [0.2,0.3,0.4,0.35,0.6,0.55,0.2,0.57,0.3,0.15,0.77,0.33,0.9,0.49, 0.45,0.41, 0.66,0.43,0.7,0.4]
     learn_data_R2 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
     learn_data2 = pd.DataFrame({'Learn': learn_data_L2, 'Real': learn_data_R2})

     score2 = Score(learn_data2['Learn'], learn_data2['Real'], 0.5, 1)

     print(score2.get_cm())
     print(score2.get_f1())
     print(score2.get_auc_by_count())
     print(score2.get_auc_by_rank())

 

  • 2
    点赞
  • 14
    收藏
    觉得还不错? 一键收藏
  • 5
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值