机器学习-*-聚类评价指标

直接上代码,参考论文《文本聚类分析效果评价及文本表示研究》

#!/usr/bin/python
# -*- coding:utf-8 -*-
"""
@Name cluster_metrics
@Description
    聚类评估方法
@Date 2019/4/4
"""
import sys
import math

def _find_dict_and_dataset(p_class = [],c_class = []):
    """
    Flatten the annotated clusters and the predicted clusters into
    document -> label lookup tables, and collect the data set.

    Example input: [["1","2"],["3","4"]], [["1","3"],["2","4"]]

    :param p_class: annotated clusters, a list of lists of documents
    :param c_class: predicted clusters, a list of lists of documents
    :return: X, p_map, c_map where X lists every document of p_class in
        order of appearance, p_map maps each document to "P<cluster index>"
        and c_map maps each document to "C<cluster index>"
    """
    # The data set is taken from the annotated side only.
    X = [doc for members in p_class for doc in members]
    # If a document appears in several clusters, the last cluster wins.
    p_map = {
        doc: "P" + str(idx)
        for idx, members in enumerate(p_class)
        for doc in members
    }
    c_map = {
        doc: "C" + str(idx)
        for idx, members in enumerate(c_class)
        for doc in members
    }
    return X, p_map, c_map

def TraditionalIndexRate(p_class = [],c_class = []):
    """
    Compute the traditional pair-counting clustering indices (rarely used
    for text clustering).

    Every unordered pair of documents is classified as
      SS - same predicted cluster, same annotated cluster
      SD - same predicted cluster, different annotated cluster
      DS - different predicted cluster, same annotated cluster
      DD - different predicted cluster, different annotated cluster

    Example input: [["1","2"],["3","4"]], [["1","3"],["2","4"]]

    :param p_class: annotated clusters, a list of lists of documents
    :param c_class: predicted clusters, a list of lists of documents
    :return: R, J, FM, PA, NA, AA where
        R  = (SS + DD) / (SS + SD + DS + DD)   (Rand index)
        J  = SS / (SS + SD + DS)               (Jaccard coefficient)
        FM = sqrt(SS/(SS+SD) * SS/(SS+DS))     (Fowlkes-Mallows index)
        PA = SS / (SS + DS)
        NA = DD / (DD + SD)
        AA = (PA + NA) / 2
        A ratio whose denominator is zero is reported as 0.0.
    """
    if len(p_class) == 0 or len(c_class) == 0 :
        print("输入的历史数据标签是空")
        sys.exit(1)
    X,p_map,c_map = _find_dict_and_dataset(p_class,c_class)

    def _ratio(numerator, denominator):
        # Degenerate inputs (a single document, one cluster holding
        # everything, only singleton clusters) make some denominators zero;
        # report 0.0 for those indices instead of raising ZeroDivisionError.
        if denominator == 0:
            return 0.0
        return numerator * 1.0 / denominator

    # Resolve every label once instead of twice per pair inside the loop.
    p_labels = [p_map.get(doc) for doc in X]
    c_labels = [c_map.get(doc) for doc in X]

    SS = SD = DS = DD = 0
    total = len(X)
    for xi in range(total - 1):
        for xj in range(xi + 1, total):
            same_cluster = c_labels[xi] == c_labels[xj]
            same_class = p_labels[xi] == p_labels[xj]
            if same_cluster and same_class:
                SS += 1
            elif same_cluster:
                SD += 1
            elif same_class:
                DS += 1
            else:
                DD += 1
    M = SS + DS + SD + DD

    R = _ratio(SS + DD, M)
    J = _ratio(SS, SS + SD + DS)
    FM = math.sqrt(_ratio(SS, SS + SD) * _ratio(SS, SS + DS))
    PA = _ratio(SS, SS + DS)
    NA = _ratio(DD, DD + SD)
    AA = (PA + NA) / 2
    return R,J,FM,PA,NA,AA

def BasedOnManualAnnotationIndexRate(p_class = [],c_class = []):
    """
    Compute Class_F, an overall score based on the precision, recall and
    F-value of each manually annotated class (recommended).

    For every annotated class Pj the best F-value over all predicted
    clusters Ci is taken, with precision = |Pj & Ci| / |Ci| and
    recall = |Pj & Ci| / |Pj|. Class_F is the average of those best
    F-values weighted by the size of each annotated class.

    Example input: [["1","2"],["3","4"]], [["1","3"],["2","4"]]

    :param p_class: annotated clusters, a list of lists of documents
    :param c_class: predicted clusters, a list of lists of documents
    :return: Class_F as a float; 0.0 when p_class holds no documents
    """
    # Build the predicted sets once rather than once per annotated class.
    predicted_sets = [set(members) for members in c_class]

    total_size = 0.0
    weighted_f = 0.0
    for members in p_class:
        Pj = set(members)
        # F-values are never negative, so 0.0 is a safe starting maximum;
        # it also covers an empty c_class, where max() would fail.
        best_f = 0.0
        for Ci in predicted_sets:
            overlap = len(Pj & Ci)
            if overlap == 0:
                # F is 0 here; skipping also avoids dividing by the size of
                # an empty cluster on either side.
                continue
            pre_ji = overlap * 1.0 / len(Ci)
            rec_ji = overlap * 1.0 / len(Pj)
            best_f = max(best_f, 2 * pre_ji * rec_ji / (pre_ji + rec_ji))
        size = len(members)
        total_size += size
        weighted_f += size * best_f

    if total_size == 0:
        # No annotated documents at all: the score is undefined, report 0.0.
        return 0.0
    return weighted_f / total_size

def BasedOnClusterIndexRate():
    """
    Placeholder for the cluster-based precision, recall and F-value.

    This metric is rarely used and is not implemented yet; the function
    currently does nothing and returns None.
    """
    return None

def _find_doc_index(doc="",mmap={},C=""):
    """
    Return the integer cluster index recorded for a document.

    :param doc: the document to look up
    :param mmap: mapping from document to a label such as "P3" or "C0"
    :param C: the label prefix to strip ("P" or "C") before parsing the index
    :return: the numeric part of the label as an int
    """
    # Every occurrence of the prefix is removed; what is left must be digits.
    return int(mmap.get(doc).replace(C, ""))

def BasedOnDocIndexRate(p_class = [],c_class = []):
    """
    Compute document-based precision, recall and F-value, averaged over all
    documents. The result is similar to Class_F and can be used to evaluate
    text clustering quality.

    For each document, S_correct is its annotated cluster and S_compute is
    its predicted cluster; precision = |S_correct & S_compute| / |S_compute|
    and recall = |S_correct & S_compute| / |S_correct|.

    Example input: [["1","2"],["3","4"]], [["1","3"],["2","4"]]

    :param p_class: annotated clusters, a list of lists of documents
    :param c_class: predicted clusters, a list of lists of documents
    :return: P, R, F - mean precision, mean recall and their harmonic mean;
        (0.0, 0.0, 0.0) when p_class holds no documents
    """
    X, p_map, c_map = _find_dict_and_dataset(p_class, c_class)
    if not X:
        # Nothing to average over; avoid dividing by zero below.
        return 0.0, 0.0, 0.0

    # Build each cluster's set once instead of once per document.
    p_sets = [set(members) for members in p_class]
    c_sets = [set(members) for members in c_class]

    Precision = []
    Recall = []
    for doc in X:
        S_correct = p_sets[_find_doc_index(doc, p_map, "P")]
        S_compute = c_sets[_find_doc_index(doc, c_map, "C")]
        overlap = len(S_correct & S_compute)
        Precision.append(overlap * 1.0 / len(S_compute))
        Recall.append(overlap * 1.0 / len(S_correct))

    P_aver = sum(Precision) / len(Precision)
    R_aver = sum(Recall) / len(Recall)
    denominator = P_aver + R_aver
    F = 2 * P_aver * R_aver / denominator if denominator else 0.0
    return P_aver,R_aver,F

def EntropyIndexRate(p_class = [],c_class = []):
    """
    Compute the entropy of a clustering result (a recommended metric).

    Entropy favours small clusters, so it should be used together with
    other measures such as Class_F.

    For each predicted cluster Ci the value is
        sum_j (|Ci & Pj| / |Ci|) * log(|Ci| / |Ci & Pj|) / len(p_class)
    where empty intersections contribute nothing.

    Example input: [["1","2"],["3","4"]], [["1","3"],["2","4"]]

    :param p_class: annotated clusters, a list of lists of documents
    :param c_class: predicted clusters, a list of lists of documents
    :return: (E, mean) where E is a list of floats, one per cluster in
        c_class, and mean is the average of E. An empty predicted cluster
        gets 0.0; if p_class is empty every entry is 0.0; if c_class is
        empty the result is ([], 0.0).
    """
    S = len(p_class)
    # Build the annotated sets once rather than once per predicted cluster.
    annotated_sets = [set(members) for members in p_class]

    E = []
    for members in c_class:
        Ci = set(members)
        Ei = 0.0
        for Pj in annotated_sets:
            shared = len(Ci & Pj)
            if shared == 0:
                # A zero-weight term; skipping it also keeps an empty Ci
                # from reaching the divisions below.
                continue
            Ei += (shared * 1.0 / len(Ci)) * math.log(len(Ci) * 1.0 / shared)
        # With no annotated classes there is nothing to normalise by.
        E.append(Ei / S if S else 0.0)

    if not E:
        return E, 0.0
    return E,math.fsum(E)/len(E)
  • 1
    点赞
  • 1
    收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
©️2022 CSDN 皮肤主题:像素格子 设计师:CSDN官方博客 返回首页
评论

打赏作者

Leo蓝色

各位老大 可怜可怜~

¥2 ¥4 ¥6 ¥10 ¥20
输入1-500的整数
余额支付 (余额:-- )
扫码支付
扫码支付:¥2
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值