import numpy as np
import pandas as pd

def js_divergence(p, q):
    # Per-element Jensen-Shannon contribution, with m = (p + q) / 2:
    # 0.5 * p * log(p / m) + 0.5 * q * log(q / m). Summed over all elements of a
    # dimension this equals the JS divergence between forecast p and actual q.
    p = np.array(p)
    q = np.array(q)
    m = (p + q) / 2
    js = 0.5 * np.sum(p * np.log(p / m)) + 0.5 * np.sum(q * np.log(q / m))
    return round(float(js), 6)

def get_exp(pred, actual, pred_sum, actual_sum):
    # Explanatory power (EP): this element's share of the overall change,
    # (actual - pred) / (actual_sum - pred_sum); fall back to the raw
    # difference when the totals are equal, to avoid dividing by zero.
    if actual_sum - pred_sum == 0:
        return actual - pred
    else:
        return (actual - pred) / (actual_sum - pred_sum)

def root_cause_analysis_for_single_dimension(dframe, dimension_2_check):
    EP_threshold = 0.1
    # Aggregate the measures within the dimension under inspection
    # (the group_isp naming is kept from the original isp example).
    group_isp = dframe.groupby(dimension_2_check).sum().reset_index()
    group_isp['a_sum'] = dframe['a'].sum()
    group_isp['f_sum'] = dframe['f'].sum()
    group_isp['q'] = group_isp['a'] / group_isp['a_sum']  # actual proportion
    group_isp['p'] = group_isp['f'] / group_isp['f_sum']  # forecast proportion
    # Step 1: compute the Surprise of each element in the current dimension.
    group_isp['surprise'] = group_isp[['p', 'q']].apply(
        lambda x: js_divergence(x['p'], x['q']), axis=1)
    # Step 2: compute the EP value of each element in the current dimension.
    group_isp['EP'] = group_isp[['f', 'a', 'f_sum', 'a_sum']].apply(
        lambda x: get_exp(x['f'], x['a'], x['f_sum'], x['a_sum']), axis=1)
    isp_surprise = group_isp['surprise'].sum()
    # Elements whose EP exceeds the TEEP threshold are treated as suspicious
    # and added to this dimension's root-cause set. The threshold drops
    # elements with weak explanatory power (a small share of the change),
    # keeping the result as concise as possible.
    group_isp = group_isp[group_isp['EP'] > EP_threshold]
    return isp_surprise, group_isp

i = 1     # current drill-down layer
D_C = []  # accumulated root-cause set

# Per layer: analyse each remaining dimension on its own, then drill down.
def mult_axis_analysis(df, axis_2_check):
    global i
    if len(axis_2_check) == 0:
        return
    df_total = []
    dimension_list = []
    print(f"layer: {i}, we will check these dimensions: {axis_2_check}")
    for a_to_c in axis_2_check:
        # Score each candidate dimension independently.
        result = df[[a_to_c, 'a', 'f']].copy()
        result.columns = ['elements', 'a', 'f']
        surprise_sum, group_isp = root_cause_analysis_for_single_dimension(result, ['elements'])
        group_isp['surprise_sum'] = surprise_sum
        group_isp['dimension'] = a_to_c
        df_total.append(group_isp)
        if surprise_sum > 0:
            dimension_list.append([a_to_c, surprise_sum])
    if not dimension_list:
        # No dimension shows any change between forecast and actual; stop here.
        return
    df_final = pd.concat(df_total, axis=0)
    df_final = df_final.sort_values(by=['surprise_sum', 'EP'], ascending=False)
    # The dimension with the largest total Surprise is the most anomalous one;
    # within it, the element with the largest EP is taken as the root cause.
    dimension = sorted(dimension_list, key=lambda x: x[1], reverse=True)[0][0]
    dimension_filter = df_final[df_final['dimension'] == dimension]
    dimension_value = dimension_filter['elements'].to_list()[0]
    D_C.append(dimension_value)
    print(f"layer: {i}, dimension: {dimension}, dimension_value: {dimension_value}")
    print(dimension_filter[['elements', 'a', 'f', 'q', 'p', 'surprise', 'EP', 'surprise_sum']])
    i += 1
    # After the most anomalous dimension is found, peel it off and recurse
    # into the next layer on the rows that match the chosen element value.
    axis_2_check.remove(dimension)
    new_columns = axis_2_check + ['a', 'f']
    df = df[df[dimension] == dimension_value][new_columns]
    mult_axis_analysis(df, axis_2_check)

if __name__ == "__main__":
    lists = [['联通', '北京', 20, 5],
             ['联通', '上海', 15, 15],
             ['联通', '广东', 10, 10],
             ['电信', '北京', 10, 7],
             ['电信', '上海', 25, 15],
             ['电信', '广东', 20, 20]]
    df = pd.DataFrame(lists, columns=['isp', 'area', 'a', 'f'])
    mult_axis_analysis(df, ['isp', 'area'])
    print("D_C:", D_C)
When you drink the water, remember who dug the well (credit to the original source):
https://developer.aliyun.com/article/849247
Multi-dimensional intelligent drill-down analysis: a study of the Adtributor algorithm
Adtributor Python implementation