聚类算法相关:
代码
import pandas as pd
import math
def load_cluster_pred(file_name):
    """Load a predicted clustering from the first sheet of an Excel file.

    The sheet must provide the columns '核心词' (the cluster representative,
    i.e. core word) and '包含词' (a word belonging to that cluster), one
    (representative, word) pair per row.

    Rows are skipped when the representative is missing, when it is the
    catch-all bucket '未分类词语' ("unclassified words"), or when the word
    itself is missing.

    Args:
        file_name: Path (or any object accepted by ``pandas.read_excel``)
            of the workbook to read.

    Returns:
        A tuple ``(word_pred_dic, cluster_pred_dic)`` where
        ``word_pred_dic`` maps each word (as ``str``) to its representative
        and ``cluster_pred_dic`` maps each representative to the list of its
        words (as ``str``).
    """
    df = pd.read_excel(file_name, sheet_name=0)
    word_pred_dic = {}
    cluster_pred_dic = {}
    for rep, word in zip(df['核心词'], df['包含词']):
        # No representative, or the "unclassified" bucket: not a real cluster.
        if pd.isna(rep) or str(rep) in ('nan', '未分类词语'):
            continue
        # An empty word cell would otherwise be stored under a NaN key.
        if pd.isna(word):
            continue
        # Words are normalised to str so that they compare equal to the keys
        # produced by load_cluster_label, which also stores str(item).
        word = str(word)
        word_pred_dic[word] = rep
        cluster_pred_dic.setdefault(rep, []).append(word)
    return word_pred_dic, cluster_pred_dic
def load_cluster_label(file_name):
    """Load a labelled clustering from an Excel file, one cluster per column.

    Every column of the sheet is treated as one cluster; the cluster id is
    the zero-based position of the column. Cells whose string form is
    ``'nan'`` are ignored. The list of column names is printed as a side
    effect.

    Args:
        file_name: Path (or any object accepted by ``pandas.read_excel``)
            of the workbook to read.

    Returns:
        A tuple ``(word_label_dic, cluster_label_dic)`` where
        ``word_label_dic`` maps each word (as ``str``) to its cluster id and
        ``cluster_label_dic`` maps each cluster id to the list of its words.
    """
    frame = pd.read_excel(file_name)
    header = frame.columns.values.tolist()
    print(header)
    word_label_dic, cluster_label_dic = {}, {}
    for cluster_id, column_name in enumerate(header):
        for cell in frame[column_name].tolist():
            text = str(cell)
            if text == 'nan':
                # Empty cell (columns have different lengths).
                continue
            word_label_dic[text] = cluster_id
            cluster_label_dic.setdefault(cluster_id, []).append(text)
    return word_label_dic, cluster_label_dic
def calculate_sum_aibj(cluster_dic, intersect_keywords):
    """Count the unordered word pairs that share a cluster.

    For every cluster, only the members that also occur in
    ``intersect_keywords`` are counted; a cluster with ``k`` such members
    contributes ``k * (k - 1) / 2`` pairs.

    Args:
        cluster_dic: Mapping from a cluster key to the list of its words.
        intersect_keywords: Iterable of the (hashable) words to take into
            account.

    Returns:
        The total number of pairs. This is a float as soon as at least one
        cluster is present, and the int ``0`` for an empty ``cluster_dic``.
    """
    # Build the lookup set once: testing membership against the raw list made
    # every single word cost O(len(intersect_keywords)).
    wanted = set(intersect_keywords)
    total = 0
    for members in cluster_dic.values():
        k = sum(1 for word in members if word in wanted)
        total += k * (k - 1) / 2
    return total
def evaluate(word_label_dic, word_pred_dic, cluster_label_dic, cluster_pred_dic,
             detail_path=None):
    """Score a predicted clustering against a labelled one by pair counting.

    Only the words present in both ``word_label_dic`` and ``word_pred_dic``
    are evaluated. Every unordered pair of those words is classified by
    whether the two words share a cluster in the labels and in the
    prediction. The counts and the derived scores (RI, ARI, FMI, JC, DI,
    precision, recall, F-value) are printed.

    Args:
        word_label_dic: Mapping word -> labelled cluster id.
        word_pred_dic: Mapping word -> predicted cluster key.
        cluster_label_dic: Mapping labelled cluster id -> list of words.
        cluster_pred_dic: Mapping predicted cluster key -> list of words.
        detail_path: Optional path of an Excel file. When given, one row per
            evaluated word pair is written there; when ``None`` (default)
            the per-pair table is not built at all.

    Returns:
        The Rand index computed by ``ri_eval``.
    """
    from itertools import combinations

    keywords = list(set(word_label_dic.keys()).intersection(word_pred_dic.keys()))
    print('intesection keywords num: {}'.format(len(keywords)))
    sum_ai = calculate_sum_aibj(cluster_label_dic, keywords)
    sum_bj = calculate_sum_aibj(cluster_pred_dic, keywords)
    n = len(keywords)

    a = 0  # same cluster in the labels and in the prediction
    b = 0  # different clusters in the labels, same cluster in the prediction
    c = 0  # same cluster in the labels, different clusters in the prediction
    d = 0  # different clusters in the labels and in the prediction
    # The per-pair table grows quadratically, so only collect it on request.
    rows = [] if detail_path is not None else None

    for wi, wj in combinations(keywords, 2):
        label_i, label_j = word_label_dic[wi], word_label_dic[wj]
        pred_i, pred_j = word_pred_dic[wi], word_pred_dic[wj]
        tag_label = label_i == label_j
        tag_predict = pred_i == pred_j
        if tag_label and tag_predict:
            a += 1
        elif tag_label:
            c += 1
        elif tag_predict:
            b += 1
        else:
            d += 1
        if rows is not None:
            rows.append([wi, wj, label_i, label_j, pred_i, pred_j, tag_label, tag_predict])

    e = a + c  # pairs sharing a cluster in the labels
    f = a + b  # pairs sharing a cluster in the prediction
    print("count number: a:{}\t b:{}\t c:{}\t d:{}".format(a, b, c, d))
    print("count number: e:{}\t f:{}".format(e, f))

    ri = ri_eval(a, d, n)
    ari = ari_eval(a, sum_ai, sum_bj, n)
    fmi = fmi_eval(a, b, c)
    jc = jc_eval(a, b, c)
    di = di_eval(a, b, c)

    # Guard each ratio: with no co-clustered pair the denominators are zero.
    p = a / (a + b) if a + b else 0.0
    r = a / (a + c) if a + c else 0.0
    f_value = 2 * p * r / (p + r) if p + r else 0.0
    print('RI: {}\n ARI: {}\n FMI: {}\n JC: {}\n DI: {}'.format(ri, ari, fmi, jc, di))
    print('precision:{}\t recall:{}\tF-value:{}\n'.format(p, r, f_value))

    if rows is not None:
        detail = pd.DataFrame(
            data=rows,
            columns=['wi', 'wj', 'wi_lab_rep', 'wj_lab_rep', 'wi_pred_rep',
                     'wj_pred_rep', 'samecluster_label', 'samecluster_pred'],
        )
        detail.to_excel(detail_path, index=False)
    return ri
def ri_eval(a, d, n):
    """Return the Rand index: the share of word pairs both clusterings agree on.

    Args:
        a: Number of pairs placed in the same cluster by both clusterings.
        d: Number of pairs placed in different clusters by both clusterings.
        n: Number of clustered items; there are ``n * (n - 1) / 2`` pairs.

    Returns:
        ``2 * (a + d) / (n * (n - 1))``. When ``n`` is 0 or 1 there is no
        pair to disagree on and 1.0 is returned instead of dividing by zero.
    """
    ordered_pairs = n * (n - 1)
    if ordered_pairs == 0:
        return 1.0
    return 2 * (a + d) / ordered_pairs
def ari_eval(a, sum_ai, sum_bj, n):
    """Return the adjusted Rand index computed from pair counts.

    The index is ``(a - expected) / (maximum - expected)`` with
    ``expected = sum_ai * sum_bj / (n * (n - 1) / 2)`` and
    ``maximum = (sum_ai + sum_bj) / 2``.

    Args:
        a: Number of pairs placed in the same cluster by both clusterings.
        sum_ai: Number of same-cluster pairs in the first clustering.
        sum_bj: Number of same-cluster pairs in the second clustering.
        n: Number of clustered items.

    Returns:
        The adjusted Rand index. 1.0 is returned when the formula is
        undefined: fewer than two items, or a zero denominator (which occurs
        when both clusterings are all-singletons or both are one single
        cluster).
    """
    ordered_pairs = n * (n - 1)
    if ordered_pairs == 0:
        return 1.0
    expected = 2 * sum_ai * sum_bj / ordered_pairs
    maximum = (sum_ai + sum_bj) / 2
    denominator = maximum - expected
    if denominator == 0:
        return 1.0
    return (a - expected) / denominator
def fmi_eval(a, b, c):
    """Return the Fowlkes-Mallows index ``sqrt(a/(a+b) * a/(a+c))``.

    Args:
        a: Number of pairs placed in the same cluster by both clusterings.
        b: Number of pairs sharing a cluster in only one of the clusterings.
        c: Number of pairs sharing a cluster in only the other clustering.

    Returns:
        The index as a float. When ``a`` is 0 the result is 0.0; this also
        covers ``a + b == 0`` or ``a + c == 0``, which used to divide by
        zero.
    """
    if a == 0:
        return 0.0
    return math.sqrt((a / (a + b)) * (a / (a + c)))
def jc_eval(a, b, c):
    """Return the Jaccard coefficient ``a / (a + b + c)`` of the pair counts.

    Args:
        a: Number of pairs placed in the same cluster by both clusterings.
        b: Number of pairs sharing a cluster in only one of the clusterings.
        c: Number of pairs sharing a cluster in only the other clustering.

    Returns:
        The coefficient as a float, or 0.0 when ``a + b + c`` is 0 (no pair
        shares a cluster in either clustering, so the ratio is undefined).
    """
    total = a + b + c
    if total == 0:
        return 0.0
    return a / total
def di_eval(a, b, c):
    """Return the Dice index ``2a / (2a + b + c)`` of the pair counts.

    Args:
        a: Number of pairs placed in the same cluster by both clusterings.
        b: Number of pairs sharing a cluster in only one of the clusterings.
        c: Number of pairs sharing a cluster in only the other clustering.

    Returns:
        The index as a float, or 0.0 when ``2a + b + c`` is 0 (no pair
        shares a cluster in either clustering, so the ratio is undefined).
    """
    total = 2 * a + b + c
    if total == 0:
        return 0.0
    return 2 * a / total
def hcv(labels, preds):
    """Check that ``labels`` and ``preds`` have the same length.

    NOTE(review): the name suggests homogeneity/completeness/V-measure, but
    the visible body only validates its inputs and returns ``None``; the
    function looks truncated -- confirm against the original source.

    Args:
        labels: Sized collection of reference cluster labels.
        preds: Sized collection of predicted cluster labels.

    Raises:
        ValueError: If the two collections differ in length. ``ValueError``
            is a subclass of ``Exception``, so handlers written for the
            previous bare ``Exception`` still catch it.
    """
    if len(labels) != len(preds):
        raise ValueError('Length error!')
from sklearn import metrics
def eval_2(labels, preds):
    """Print scikit-learn agreement scores between two labelings.

    Computes ARI, AMI, homogeneity, completeness and V-measure through the
    module-level ``metrics`` import and prints them; nothing is returned.

    Args:
        labels: Reference cluster labels, one per sample.
        preds: Predicted cluster labels, one per sample.
    """
    adjusted_rand = metrics.adjusted_rand_score(labels, preds)
    adjusted_mi = metrics.adjusted_mutual_info_score(labels, preds)
    # Homogeneity: each cluster contains only members of a single class.
    homog = metrics.homogeneity_score(labels, preds)
    # Completeness: all members of a given class are assigned to the same cluster.
    compl = metrics.completeness_score(labels, preds)
    # V-measure: harmonic mean of homogeneity and completeness.
    v_score = metrics.v_measure_score(labels, preds)

    agreement_report = 'matrix \nARI:{}\nAMI:{}\n'.format(adjusted_rand, adjusted_mi)
    print(agreement_report)
    purity_report = 'homogeneity:{}\tcompleteness:{}\tV-measure:{}\n'.format(
        homog, compl, v_score)
    print(purity_report)
转载请注明出处
觉得有用,麻烦点个赞,喜欢我请关注!!!