聚类算法(三)—— 评测方法3(代码)

 

聚类算法相关:

聚类算法(一)—— DBSCAN

聚类算法(二)—— 优缺点对比

聚类算法(三)—— 评测方法1

聚类算法(三)—— 评测方法2

聚类算法(三)—— 评测方法3(代码)

聚类算法(四)—— 基于词语相似度的聚类算法(含代码)

聚类算法(五)—— 层次聚类 linkage(含代码)

聚类算法(六)—— 谱聚类(含代码)

代码

 

import pandas as pd
import math

def load_cluster_pred(file_name):
    """Load the predicted clustering from the first sheet of an Excel file.

    The sheet must contain the columns ``核心词`` (the representative that
    identifies a cluster) and ``包含词`` (one word belonging to that cluster),
    one word per row.

    A row is skipped when its representative is missing, when the
    representative equals ``未分类词语``, or when the word cell is missing.

    Args:
        file_name: Workbook location, passed straight to
            ``pandas.read_excel``.

    Returns:
        A tuple ``(word_pred_dic, cluster_pred_dic)``:
        ``word_pred_dic`` maps each word (as ``str``) to its representative;
        ``cluster_pred_dic`` maps each representative to the list of its
        words in row order.
    """
    df = pd.read_excel(file_name, sheet_name=0)
    word_pred_dic = {}
    cluster_pred_dic = {}
    for rep, word in zip(df['核心词'], df['包含词']):
        # A missing word would otherwise end up as a NaN dictionary key.
        if pd.isna(rep) or pd.isna(word):
            continue
        if str(rep) == '未分类词语':
            continue
        # load_cluster_label keys its words by str(item); normalise the same
        # way here so the key intersection in evaluate() can match cells that
        # pandas parsed as non-string values.
        word = str(word)
        word_pred_dic[word] = rep
        cluster_pred_dic.setdefault(rep, []).append(word)
    return word_pred_dic, cluster_pred_dic

def load_cluster_label(file_name):
    """Load the reference clustering from an Excel file.

    Every column of the sheet is one cluster, identified by its zero-based
    column position; the cells of that column are the cluster's words.
    Cells whose string form is ``'nan'`` are ignored. The list of column
    headers is printed as a side effect.

    Args:
        file_name: Workbook location, passed straight to
            ``pandas.read_excel``.

    Returns:
        A tuple ``(word_label_dic, cluster_label_dic)``:
        ``word_label_dic`` maps ``str(word)`` to its column position;
        ``cluster_label_dic`` maps a column position to the list of
        ``str(word)`` values found in that column.
    """
    sheet = pd.read_excel(file_name)
    headers = sheet.columns.values.tolist()
    print(headers)
    word_label_dic, cluster_label_dic = {}, {}
    for cluster_id, header in enumerate(headers):
        for cell in sheet[header].tolist():
            text = str(cell)
            if text == 'nan':
                # Padding left by pandas where this column is shorter.
                continue
            word_label_dic[text] = cluster_id
            cluster_label_dic.setdefault(cluster_id, []).append(text)
    return word_label_dic, cluster_label_dic

def calculate_sum_aibj(cluster_dic, intersect_keywords):
    """Count the within-cluster word pairs, restricted to selected keywords.

    For every cluster only the members that also appear in
    ``intersect_keywords`` are counted; a cluster with ``k`` such members
    contributes ``k * (k - 1) / 2`` pairs.

    Args:
        cluster_dic: Mapping from a cluster identifier to the list of words
            in that cluster.
        intersect_keywords: Iterable of the (hashable) words to keep.

    Returns:
        The total number of pairs summed over all clusters (``0`` when
        ``cluster_dic`` is empty, otherwise a float because of the ``/ 2``).
    """
    # Build the lookup once: testing membership against the original list
    # inside the loop costs O(len(intersect_keywords)) per word.
    kept = set(intersect_keywords)
    pair_total = 0
    for members in cluster_dic.values():
        size = sum(1 for word in members if word in kept)
        pair_total += size * (size - 1) / 2
    return pair_total

def evaluate(word_label_dic, word_pred_dic, cluster_label_dic, cluster_pred_dic):
    """Compare a predicted clustering with a reference by pair counting.

    Only words present in both ``word_label_dic`` and ``word_pred_dic`` are
    evaluated. Every unordered pair of those words is classified by whether
    the two words share a cluster in the reference and in the prediction,
    and the resulting counts feed RI, ARI, FMI, Jaccard and Dice as well as
    pairwise precision / recall / F-value. All intermediate counts and
    scores are printed.

    Args:
        word_label_dic: Mapping word -> reference cluster id.
        word_pred_dic: Mapping word -> predicted cluster id.
        cluster_label_dic: Mapping reference cluster id -> list of words.
        cluster_pred_dic: Mapping predicted cluster id -> list of words.

    Returns:
        The Rand index computed by ``ri_eval``.
    """
    keywords = list(set(word_label_dic.keys()).intersection(word_pred_dic.keys()))
    print('intesection keywords num: {}'.format(len(keywords)))
    sum_ai = calculate_sum_aibj(cluster_label_dic, keywords)
    sum_bj = calculate_sum_aibj(cluster_pred_dic, keywords)
    n = len(keywords)
    a = 0  # same cluster in label, same cluster in prediction
    b = 0  # different clusters in label, same cluster in prediction
    c = 0  # same cluster in label, different clusters in prediction
    d = 0  # different clusters in label, different clusters in prediction
    e = 0  # same cluster in label (a + c)
    f = 0  # same cluster in prediction (a + b)
    for i in range(n):
        # The first word of the pair is fixed for the whole inner loop.
        wi = keywords[i]
        label_i = word_label_dic[wi]
        pred_i = word_pred_dic[wi]
        for j in range(i + 1, n):
            wj = keywords[j]
            tag_label = label_i == word_label_dic[wj]
            tag_predict = pred_i == word_pred_dic[wj]
            if tag_label and tag_predict:
                a += 1
            elif tag_label and not tag_predict:
                c += 1
            elif not tag_label and tag_predict:
                b += 1
            else:
                d += 1
            if tag_label:
                e += 1
            if tag_predict:
                f += 1
    print("count number: a:{}\t b:{}\t c:{}\t d:{}".format(a, b, c, d))
    print("count number: e:{}\t f:{}".format(e, f))
    ri = ri_eval(a, d, n)
    ari = ari_eval(a, sum_ai, sum_bj, n)
    fmi = fmi_eval(a, b, c)
    jc = jc_eval(a, b, c)
    di = di_eval(a, b, c)

    # Pairwise precision / recall: b counts false-positive pairs and c counts
    # false-negative pairs. A zero denominator is reported as 0.0 rather
    # than raising ZeroDivisionError.
    p = a / (a + b) if a + b else 0.0
    r = a / (a + c) if a + c else 0.0
    f_value = 2 * p * r / (p + r) if p + r else 0.0
    print('RI: {}\n ARI: {}\n FMI: {}\n JC: {}\n DI: {}'.format(ri, ari, fmi, jc, di))
    print('precision:{}\t recall:{}\tF-value:{}\n'.format(p, r, f_value))
    return ri

def ri_eval(a, d, n):
    """Return the Rand index ``2 * (a + d) / (n * (n - 1))``.

    Args:
        a: Number of pairs placed together in both clusterings.
        d: Number of pairs separated in both clusterings.
        n: Number of items that were paired.

    Returns:
        The fraction of pairs on which the two clusterings agree. With fewer
        than two items there is no pair to disagree on, so ``1.0`` is
        returned instead of dividing by zero.
    """
    if n < 2:
        return 1.0
    return 2 * (a + d) / (n * (n - 1))

def ari_eval(a, sum_ai, sum_bj, n):
    """Return the adjusted Rand index computed from pair counts.

    Args:
        a: Number of pairs placed together in both clusterings.
        sum_ai: Number of pairs placed together in the first clustering.
        sum_bj: Number of pairs placed together in the second clustering.
        n: Number of items that were paired.

    Returns:
        ``(a - expected) / (max_index - expected)`` where
        ``expected = sum_ai * sum_bj / C(n, 2)`` and
        ``max_index = (sum_ai + sum_bj) / 2``. When there are fewer than two
        items, or when ``max_index`` equals ``expected``, ``1.0`` is
        returned instead of dividing by zero.
    """
    if n < 2:
        return 1.0
    expected = sum_ai * sum_bj * 2 / (n * (n - 1))
    numerator = a - expected
    denominator = (sum_ai + sum_bj) / 2 - expected
    if denominator == 0:
        # Happens when both clusterings are a single cluster or both are all
        # singletons, i.e. they are identical.
        return 1.0
    return numerator / denominator

def fmi_eval(a, b, c):
    """Return the Fowlkes-Mallows index ``sqrt(a/(a+b) * a/(a+c))``.

    Args:
        a: Number of pairs placed together in both clusterings.
        b: Number of pairs placed together in only one of the clusterings.
        c: Number of pairs placed together in only the other clustering.

    Returns:
        The geometric mean of ``a / (a + b)`` and ``a / (a + c)``, or
        ``0.0`` when ``a`` is zero (which also covers the cases where a
        denominator would be zero).
    """
    if a == 0:
        return 0.0
    return math.sqrt((a / (a + b)) * (a / (a + c)))

def jc_eval(a, b, c):
    """Return the Jaccard coefficient ``a / (a + b + c)``.

    Args:
        a: Number of pairs placed together in both clusterings.
        b: Number of pairs placed together in only one of the clusterings.
        c: Number of pairs placed together in only the other clustering.

    Returns:
        The coefficient, or ``0.0`` when no pair is placed together by
        either clustering (the ratio is undefined; 0.0 is reported instead
        of raising ZeroDivisionError).
    """
    together = a + b + c
    if together == 0:
        return 0.0
    return a / together

def di_eval(a, b, c):
    """Return the Dice index ``2a / (2a + b + c)``.

    Args:
        a: Number of pairs placed together in both clusterings.
        b: Number of pairs placed together in only one of the clusterings.
        c: Number of pairs placed together in only the other clustering.

    Returns:
        The index, or ``0.0`` when no pair is placed together by either
        clustering (the ratio is undefined; 0.0 is reported instead of
        raising ZeroDivisionError).
    """
    weight = 2 * a + b + c
    if weight == 0:
        return 0.0
    return 2 * a / weight


def hcv(labels, preds):
    """Validate that ``labels`` and ``preds`` have the same length.

    NOTE(review): this function is an unfinished stub. Beyond the length
    check it computes nothing and returns ``None``. The name suggests
    homogeneity / completeness / V-measure was intended; confirm with the
    author.

    Args:
        labels: Sized collection of reference cluster assignments.
        preds: Sized collection of predicted cluster assignments.

    Raises:
        ValueError: If the two collections differ in length.
    """
    if len(labels) != len(preds):
        # ValueError is a subclass of Exception, so existing
        # ``except Exception`` handlers keep working.
        raise ValueError('Length error!')




from sklearn import metrics
def eval_2(labels, preds):
    """Print scikit-learn clustering scores for ``preds`` against ``labels``.

    Computes the adjusted Rand index, adjusted mutual information,
    homogeneity, completeness and V-measure through ``sklearn.metrics`` and
    prints them in two blocks. Nothing is returned.

    Args:
        labels: Reference cluster assignments.
        preds: Predicted cluster assignments.
    """
    # Homogeneity: each cluster contains only members of a single class.
    # Completeness: all members of a given class are assigned to the same cluster.
    # V-measure: the harmonic mean of the two.
    h_score = metrics.homogeneity_score(labels, preds)
    c_score = metrics.completeness_score(labels, preds)
    v_score = metrics.v_measure_score(labels, preds)

    rand_adjusted = metrics.adjusted_rand_score(labels, preds)
    mutual_info_adjusted = metrics.adjusted_mutual_info_score(labels, preds)

    print('matrix \nARI:{}\nAMI:{}\n'.format(rand_adjusted, mutual_info_adjusted))
    print('homogeneity:{}\tcompleteness:{}\tV-measure:{}\n'.format(h_score, c_score, v_score))

转载请注明出处

觉得有用,麻烦点个赞,喜欢我请关注!!!

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

微知girl

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值