聚类算法（三）—— 评测方法3（代码）

微知girl

于 2020-09-03 11:26:10 发布

阅读量820

点赞数

分类专栏： # 聚类算法 NLP 文章标签：自然语言处理 聚类算法 机器学习 python

本文链接：https://blog.csdn.net/katrina1rani/article/details/108379134

版权

NLP 同时被 2 个专栏收录

50 篇文章 13 订阅

订阅专栏

聚类算法

9 篇文章 11 订阅

订阅专栏

聚类算法相关：

聚类算法（四）—— 基于词语相似度的聚类算法（含代码）

聚类算法（五）——层次聚类 linkage （含代码）

聚类算法（六）——谱聚类（含代码）

代码

import pandas as pd
import math

def load_cluster_pred(file_name):
    """Load predicted clusters from the first sheet of an Excel workbook.

    The sheet must contain a '核心词' column (the cluster representative)
    and a '包含词' column (a word assigned to that cluster), one word per
    row.  Rows are skipped when the representative is missing or equals
    '未分类词语', or when the word cell is empty.

    Words are normalised with ``str`` so the keys agree with those built by
    ``load_cluster_label``, which stringifies every cell; without this a
    numeric cell (e.g. ``360``) could never match its labelled counterpart.

    Args:
        file_name: Anything accepted by ``pandas.read_excel`` as ``io``.

    Returns:
        ``(word_pred_dic, cluster_pred_dic)``: the first maps each word to
        its representative, the second maps each representative to the list
        of its words in row order.
    """
    df = pd.read_excel(file_name, 0)
    word_pred_dic = {}
    cluster_pred_dic = {}
    for rep, word in zip(df['核心词'], df['包含词']):
        # Empty cells are read as NaN, whose string form is 'nan'.
        if str(rep) in ('nan', '未分类词语'):
            continue
        word = str(word)
        if word == 'nan':
            continue
        word_pred_dic[word] = rep
        cluster_pred_dic.setdefault(rep, []).append(word)
    return word_pred_dic, cluster_pred_dic

def load_cluster_label(file_name):
    """Load labelled clusters from an Excel sheet laid out one cluster per column.

    Every non-empty cell of a column is a word belonging to that column's
    cluster; the cluster id is the zero-based column position.  Cells are
    converted with ``str`` before being stored.  The column headers are
    printed as a side effect.

    Args:
        file_name: Anything accepted by ``pandas.read_excel`` as ``io``.

    Returns:
        ``(word_label_dic, cluster_label_dic)``: word -> cluster id, and
        cluster id -> list of words in row order.
    """
    sheet = pd.read_excel(file_name)
    headers = sheet.columns.values.tolist()
    print(headers)
    word_label_dic, cluster_label_dic = {}, {}
    for cluster_id, header in enumerate(headers):
        for cell in sheet[header].tolist():
            text = str(cell)
            # Empty cells come back as NaN, rendered as 'nan'.
            if text == 'nan':
                continue
            word_label_dic[text] = cluster_id
            cluster_label_dic.setdefault(cluster_id, []).append(text)
    return word_label_dic, cluster_label_dic

def calculate_sum_aibj(cluster_dic, intersect_keywords):
    """Sum, over all clusters, the number of word pairs inside each cluster.

    Only words that also appear in ``intersect_keywords`` are counted.  For
    a cluster with ``k`` such words the contribution is ``k * (k - 1) / 2``.

    Args:
        cluster_dic: Mapping of cluster key -> list of words.
        intersect_keywords: Iterable of hashable words to restrict to.

    Returns:
        The summed pair count (``0`` for an empty ``cluster_dic``, otherwise
        a float because of the true division).
    """
    # Build the lookup once: testing membership against a list inside the
    # loop made this quadratic in the vocabulary size.
    keyword_set = set(intersect_keywords)
    sum_pairs = 0
    for members in cluster_dic.values():
        size = sum(1 for word in members if word in keyword_set)
        sum_pairs += size * (size - 1) / 2
    return sum_pairs

def evaluate(word_label_dic, word_pred_dic, cluster_label_dic, cluster_pred_dic):
    """Score a predicted clustering against a labelled one by pair counting.

    Only words present in both ``word_label_dic`` and ``word_pred_dic`` are
    evaluated.  Every unordered pair of those words is classified by
    whether the two words share a cluster in the labels and in the
    prediction; the resulting counts feed ``ri_eval``, ``ari_eval``,
    ``fmi_eval``, ``jc_eval`` and ``di_eval``, plus pairwise precision,
    recall and F-value.  All scores are printed.

    Args:
        word_label_dic: word -> labelled cluster id.
        word_pred_dic: word -> predicted cluster representative.
        cluster_label_dic: labelled cluster id -> list of words.
        cluster_pred_dic: predicted representative -> list of words.

    Returns:
        The Rand Index computed by ``ri_eval``.
    """
    keywords = list(set(word_label_dic.keys()).intersection(word_pred_dic.keys()))
    print('intesection keywords num: {}'.format(len(keywords)))
    sum_ai = calculate_sum_aibj(cluster_label_dic, keywords)
    sum_bj = calculate_sum_aibj(cluster_pred_dic, keywords)
    n = len(keywords)
    a = 0  # together in label, together in predict (true positive)
    b = 0  # apart in label, together in predict (false positive)
    c = 0  # together in label, apart in predict (false negative)
    d = 0  # apart in label, apart in predict (true negative)
    for i in range(n):
        label_i = word_label_dic[keywords[i]]
        pred_i = word_pred_dic[keywords[i]]
        for j in range(i + 1, n):
            tag_label = label_i == word_label_dic[keywords[j]]
            tag_predict = pred_i == word_pred_dic[keywords[j]]
            if tag_label and tag_predict:
                a += 1
            elif tag_label:
                c += 1
            elif tag_predict:
                b += 1
            else:
                d += 1
    e = a + c  # pairs sharing a cluster in the labels
    f = a + b  # pairs sharing a cluster in the prediction
    print("count number: a:{}\t b:{}\t c:{}\t d:{}".format(a, b, c, d))
    print("count number: e:{}\t f:{}".format(e, f))
    ri = ri_eval(a, d, n)
    ari = ari_eval(a, sum_ai, sum_bj, n)
    fmi = fmi_eval(a, b, c)
    jc = jc_eval(a, b, c)
    di = di_eval(a, b, c)

    # Guard the ratios: with no agreeing pair (a == 0) precision and recall
    # are both zero and the harmonic mean would otherwise divide by zero.
    p = a / (a + b) if a + b else 0.0
    r = a / (a + c) if a + c else 0.0
    f_value = 2 * p * r / (p + r) if p + r else 0.0
    print('RI: {}\n ARI: {}\n FMI: {}\n JC: {}\n DI: {}'.format(ri, ari, fmi, jc, di))
    print('precision：{}\t recall：{}\tF-value:{}\n'.format(p, r, f_value))
    return ri

def ri_eval(a, d, n):
    """Return the Rand Index: agreeing pairs over all pairs.

    Args:
        a: Number of pairs placed together in both clusterings.
        d: Number of pairs placed apart in both clusterings.
        n: Number of items; there are ``n * (n - 1) / 2`` pairs in total.
    """
    doubled_agreements = 2 * (a + d)
    doubled_pair_total = n * (n - 1)
    return doubled_agreements / doubled_pair_total

def ari_eval(a, sum_ai, sum_bj, n):
    """Return the Adjusted Rand Index from pair counts.

    Computes ``(index - expected) / (max_index - expected)`` where
    ``index`` is ``a``, ``expected`` is ``sum_ai * sum_bj`` divided by the
    total number of pairs, and ``max_index`` is the mean of ``sum_ai`` and
    ``sum_bj``.

    Args:
        a: Number of pairs placed together in both clusterings.
        sum_ai: Number of within-cluster pairs in the first clustering.
        sum_bj: Number of within-cluster pairs in the second clustering.
        n: Number of items.

    Returns:
        The ARI.  When there are no pairs at all (``n`` is 0 or 1) or the
        denominator is zero, 1.0 is returned instead of raising
        ``ZeroDivisionError``.  The denominator is zero only when both
        clusterings are all singletons or both are a single cluster, i.e.
        they are identical.
    """
    doubled_pair_total = n * (n - 1)
    if doubled_pair_total == 0:
        return 1.0
    expected = 2 * sum_ai * sum_bj / doubled_pair_total
    denominator = (sum_ai + sum_bj) / 2 - expected
    if denominator == 0:
        return 1.0
    return (a - expected) / denominator

def fmi_eval(a, b, c):
    """Return the Fowlkes-Mallows index ``sqrt(a/(a+b) * a/(a+c))``.

    Args:
        a: Number of pairs placed together in both clusterings.
        b: Number of pairs together in only one of the clusterings.
        c: Number of pairs together in only the other clustering.

    Returns:
        The index, or 0.0 when ``a`` is zero.  That value equals what the
        formula gives whenever it is defined, and avoids a
        ``ZeroDivisionError`` when ``a + b`` or ``a + c`` is zero.
    """
    if a == 0:
        return 0.0
    return math.sqrt((a / (a + b)) * (a / (a + c)))

def jc_eval(a, b, c):
    """Return the Jaccard coefficient ``a / (a + b + c)`` of the pair counts."""
    pairs_together_somewhere = a + b + c
    return a / pairs_together_somewhere

def di_eval(a, b, c):
    """Return the Dice index ``2a / (2a + b + c)`` of the pair counts."""
    doubled_agreement = 2 * a
    return doubled_agreement / (doubled_agreement + b + c)


def hcv(labels, preds):
    """Validate that ``labels`` and ``preds`` have the same length.

    NOTE(review): this function is an unfinished placeholder; beyond the
    length check nothing is computed and ``None`` is returned.  The name
    suggests homogeneity/completeness/V-measure was intended; confirm
    before relying on it (``eval_2`` obtains those scores from
    scikit-learn).

    Raises:
        ValueError: If the two sequences differ in length.  ``ValueError``
            is a subclass of ``Exception``, so existing handlers still
            catch it.
    """
    if len(labels) != len(preds):
        raise ValueError('Length error!')




from sklearn import metrics
def eval_2(labels, preds):
    """Print scikit-learn clustering scores for ``preds`` against ``labels``.

    Reports ARI, AMI, homogeneity, completeness and V-measure.  Nothing is
    returned; the scores are only printed.
    """
    rand_adjusted = metrics.adjusted_rand_score(labels, preds)
    mutual_info_adjusted = metrics.adjusted_mutual_info_score(labels, preds)

    # Homogeneity: each cluster contains only members of a single class.
    homogeneity_value = metrics.homogeneity_score(labels, preds)
    # Completeness: all members of a given class land in the same cluster.
    completeness_value = metrics.completeness_score(labels, preds)
    # V-measure: the harmonic mean of the two scores above.
    v_measure_value = metrics.v_measure_score(labels, preds)

    print('matrix \nARI：{}\nAMI:{}\n'.format(rand_adjusted, mutual_info_adjusted))
    print('homogeneity：{}\tcompleteness:{}\tV-measure:{}\n'.format(
        homogeneity_value, completeness_value, v_measure_value))

转载请注明出处

觉得有用，麻烦点个赞，喜欢我请关注！！！

微知girl

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
打赏
0
评论
聚类算法（三）—— 评测方法3（代码）

聚类算法（一）——DBSCAN；聚类算法（二）—— 优缺点对比；聚类算法（四）—— 基于词语相似度的聚类算法；聚类算法（三）—— 评测方法1；聚类算法（三）—— 评测方法2。import pandas as pd; import math; def load_cluster_pred(file_name): df = pd.read_excel(file_name, 0); word_pred_dic = {}; cluster_pred_dic = {}; for re…
复制链接

扫一扫