# -*- coding: utf-8 -*-
"""
Created on Fri Feb 15 14:38:09 2019
@author: yangshijin
"""
import pandas as pd
import numpy as np
import math
import os
import json
# 警告处理
import warnings
warnings.filterwarnings('ignore')
def check_data(x):
    """Return True if the parsed inner-json dict carries the expected
    ``data -> RSL[0] -> RS -> desc`` payload, False otherwise.

    param: x -- a dict parsed from the inner json column
    return: bool
    """
    try:
        # EAFP probe: just attempt the full path.
        x['data']['RSL'][0]['RS']['desc']
        return True
    except (KeyError, IndexError, TypeError):
        # Missing key, empty RSL list, or a non-dict node (e.g. None)
        # all mean "payload not present". Narrowed from a bare except.
        return False
def analysis_inner_json(df, inner_json_col):
    """Parse the inner json column and expand its payload into columns.

    param: df -- raw dataframe
    param: inner_json_col -- name/index of the inner json column
    return: df_final -- rows whose inner json parsed successfully, with
            the payload dict expanded into new columns and the original
            json column dropped.
    """
    # Convert the json strings to dicts. If this line raises, the column
    # probably contains special characters that json.loads cannot handle.
    dict_col = df[inner_json_col].map(json.loads)
    # Compute the validity mask ONCE and reuse it (the original code ran
    # check_data over the whole column twice).
    mask = dict_col.map(check_data)
    # Filter the raw rows the same way so the later concat lines up.
    df1 = df[mask].reset_index(drop=True)
    # The payload is itself a json string nested inside the dict.
    desc_col = dict_col[mask].map(
        lambda x: json.loads(x['data']['RSL'][0]['RS']['desc']))
    # list() drops the original index, so data_inner is 0..n-1 like df1.
    data_inner = pd.DataFrame(list(desc_col))
    # Merge and drop the consumed json column.
    df_final = pd.concat([df1, data_inner], axis=1)
    del df_final[inner_json_col]
    return df_final
def tf_json2(r, json_col):
    """Parse one row's outer-json string into three fields.

    The raw outer json is frequently malformed (stray ';' separators and
    'gx_err' markers glued between objects), so a chain of literal
    replacements normalises the text into a json array before parsing.
    If new garbage patterns appear in the data, add replacements here.

    param: r -- one row (Series or mapping) of the dataframe
    param: json_col -- name/index of the outer json column in r
    return: pd.Series with keys 'data_cg', 'result_score',
            'result_features'. On any parse failure, data_cg holds the
            (partially cleaned) text and the scores are the -999 sentinel.
    """
    s = r[json_col].lower() if pd.notnull(r[json_col]) else ''
    try:
        # Wrap the concatenated objects into a json array and repair the
        # known malformed separators.
        s = '[' + s.replace('};{', '},{') + ']'
        s = s.replace('};;{', '},{')
        s = s.replace(';{', '{')
        s = s.replace('};', '}')
        s = s.replace('}gx_err{', '},{')
        s = s.replace('}gx_err', '}')
        s = s.replace('gx_err{', '{')
        s = json.loads(s)
        # Defaults in case a fragment is missing from the log line.
        j = {
            'data': {
                'cg': np.nan
            },
            'result':
            {
                'score': np.nan,
                'features': np.nan
            }
        }
        # Later fragments overwrite earlier ones at the top-level keys.
        for x in s:
            j.update(x)
        # A fragment may have replaced a default sub-dict with a partial
        # one, so re-check every nested key before reading it.
        if 'data' in j and 'cg' in j['data']:
            data_cg = j['data']['cg']
        else:
            data_cg = np.nan
        if 'result' in j:
            if 'score' in j['result']:
                result_score = j['result']['score']
            else:
                result_score = np.nan
            if 'features' in j['result']:
                result_features = j['result']['features']
            else:
                result_features = np.nan
        else:
            result_score = np.nan
            result_features = np.nan
        sr = pd.Series({
            'data_cg': data_cg,
            'result_score': result_score,
            'result_features': result_features
        })
        return sr
    except Exception:
        # Narrowed from a bare except (which also swallowed SystemExit /
        # KeyboardInterrupt). Any parse failure marks the row with -999.
        sr = pd.Series({
            'data_cg': s,
            'result_score': -999,
            'result_features': -999
        })
        return sr
def analysis_outer_json(df, outer_json_col, text):
    """Parse the outer json column and clean the extracted fields.

    param: df -- dataframe after the inner json was parsed successfully
    param: outer_json_col -- name/index of the outer json column
    param: text -- prefix for the output file name
    return: df -- dataframe with 'data_cg', 'result_score',
            'result_features' extracted and the json column dropped.
    Side effect: writes ./<text>_final.csv (tab separated, no index).
    """
    df[['data_cg', 'result_score', 'result_features']] = df.apply(
        lambda r: tf_json2(r, json_col=outer_json_col), axis=1)
    del df[outer_json_col]
    # Convert the extracted columns. The raw text often carries special
    # characters ('<', '[gx_err]') that must be stripped before casting.
    # NOTE: np.int / np.str / np.float were deprecated aliases of the
    # builtins and are removed in numpy >= 1.24; the builtins are used.
    df = df.fillna(np.nan)
    df['GEO_SCORE'] = df['GEO_SCORE'].replace('nan', np.nan).astype(int)
    df['data_cg'] = df['data_cg'].replace('nan', np.nan)
    df['result_score'] = df['result_score'].replace('nan', np.nan)
    df['data_cg'] = df['data_cg'].astype(str).map(lambda x: x.replace("<", ""))
    df['data_cg'] = df['data_cg'].astype(str).map(lambda x: x.replace("[gx_err]", '')).astype(float)
    df['result_score'] = df['result_score'].astype(str).map(lambda x: x.replace("<", "")).astype(float)
    df['result_features'] = df['result_features'].astype(str).map(lambda x: x.replace("<", ""))
    df.to_csv("./{}_final.csv".format(text), sep='\t', index=False)
    return df
def get_desc(base_df, df, apply_time, text):
    """Per-day descriptive statistics.

    Columns to be described must already be int or float typed.
    param: base_df -- raw data
    param: df -- data after json parsing
    param: apply_time -- name/index of the timestamp column
    param: text -- prefix for the output file name
    return: (df_desc, base_df_cnt) -- transposed per-day describe()
            table, and raw per-day query counts (for computing the match
            rate; not yet folded into a single DataFrame).
    Side effect: writes ./<text>_desc.xlsx.
    """
    # -999 is the parse-failure sentinel from tf_json2; treat as missing.
    # (DataFrame.replace instead of the original per-column apply.)
    df = df.replace(-999, np.nan)
    # Daily query volume of the raw data (date = first 8 chars, yyyymmdd).
    base_df_cnt = base_df[apply_time].map(lambda x: x[:8]).value_counts(ascending=True).reset_index()
    df[apply_time] = df[apply_time].map(lambda x: x[:8])
    # describe() with deciles 10%..100%.
    percentiles = np.linspace(0.1, 1, 10, endpoint=True).tolist()
    df_desc = df.groupby(df[apply_time]).describe(percentiles=percentiles)
    df_desc = df_desc.T
    df_desc.to_excel("./{}_desc.xlsx".format(text), index=True)
    return df_desc, base_df_cnt
def binning(data_res, col_cut_by):
    """Bin a score column and write the per-bin count/share to Excel.

    param: data_res -- dataframe holding the score column and 'y_label'
    param: col_cut_by -- name of the score column to bin
    Side effect: writes ./<col_cut_by>bins.xlsx; returns None.
    """
    a = col_cut_by
    # .copy() so the column assignments below do not modify (a view of)
    # data_res and do not trigger SettingWithCopyWarning.
    test_result_out = data_res[[a, 'y_label']].copy()
    test_result_out['score'] = [int(i) for i in test_result_out[a]]
    # Fixed 30-point score bins from (299, 330] up to (900, 930].
    test_result_out['bin'] = pd.cut(
        list(test_result_out['score']),
        [299, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600, 630,
         660, 690, 720, 750, 780, 810, 840, 870, 900, 930])
    test_result_out['bin'] = test_result_out['bin'].astype(str)
    # Scores outside the edges become the string 'nan'; mark them -999.
    test_result_out['bin'] = test_result_out['bin'].replace('nan', -999)
    bins = test_result_out.groupby('bin', as_index=False).count()[['bin', 'y_label']]
    # (Removed a no-op statement that computed the sum without using it.)
    bins['rate'] = bins['y_label'] / bins['y_label'].sum()
    bins.columns = ['bin', 'count', 'percentage']
    bins['percentage'] = [round(i, 3) for i in bins['percentage']]
    bins.to_excel("./" + a + "bins.xlsx", index=False)
def get_psi(y_train_pred, y_test_pred, bins=None):
    """Bin two score series and compute the PSI between them.

    param: y_train_pred, y_test_pred -- the two columns to compare (when
           parsing logs these are two consecutive days of one score).
    param: bins -- bin edges; None (the default) means the GEO_SCORE
           bins 300, 330, ..., 950. Pass e.g.
           [0, 0.1, ..., 1] for probability-like scores. The default is
           built inside the function to avoid a mutable default argument.
    return: (total psi, per-bin DataFrame with columns bin/count/psi)
    """
    if bins is None:
        bins = list(range(300, 951, 30))  # GEO_SCORE bins
    y_train_pred = y_train_pred.dropna(axis=0, how='any')
    y_test_pred = y_test_pred.dropna(axis=0, how='any')

    def calculate_psi(x, y):
        # Single-bin PSI term; undefined when the test share is 0 / NaN.
        if pd.isnull(y) or y == 0:
            return None
        else:
            return (x - y) * math.log(x / y)

    y_train_bins = pd.cut(y_train_pred, bins=bins, right=False)
    y_test_bins = pd.cut(y_test_pred, bins=bins, right=False)
    df_psi = pd.DataFrame()
    df_psi['count'] = y_test_bins.value_counts()
    # Per-bin shares; pandas aligns the three value_counts on the shared
    # interval index.
    df_psi['train_bin_pct'] = y_train_bins.value_counts().map(lambda v: v / y_train_bins.shape[0])
    df_psi['test_bin_pct'] = y_test_bins.value_counts().map(lambda v: v / y_test_bins.shape[0])
    # PSI is only defined where both shares are positive.
    df_psi['psi'] = df_psi.apply(
        lambda r: calculate_psi(r['train_bin_pct'], r['test_bin_pct'])
        if r['train_bin_pct'] > 0 and r['test_bin_pct'] > 0 else np.nan, axis=1)
    df_psi.index.name = 'bin'
    df_psi = df_psi.reset_index()
    return df_psi['psi'].sum(), df_psi[['bin', 'count', 'psi']]
def show_psi(df, pivot_index, pivot_columns, pivot_values, ls_date, ls_psi, ls_df_psi, bins=None):
    """Daily PSI (excluding the first day) plus per-day bin counts.

    param: df -- dataframe to pivot
    param: pivot_index -- pivot index column
    param: pivot_columns -- pivot columns column
    param: pivot_values -- pivot values column
    param: ls_date -- list of dates, oldest first
    param: ls_psi -- pre-sized list, overwritten in place with each day's psi
    param: ls_df_psi -- pre-sized list, overwritten in place with each
           day's bin statistics
    param: bins -- bin edges; None (default) means the GEO_SCORE bins
           300, 330, ..., 950 (avoids a mutable default argument).
    return: (ls_psi, ls_df_psi, first_day_cnt) -- first_day_cnt is the
            first day's bin counts, which get_psi never produces.
    """
    if bins is None:
        bins = list(range(300, 951, 30))  # GEO_SCORE bins
    df = df.reset_index()
    df_pivot = df.pivot(index=pivot_index, columns=pivot_columns, values=pivot_values)
    # Bin statistics for the first day, computed before the psi loop.
    first_day = df_pivot.loc[:, ls_date[0]]
    first_day_cnt = pd.cut(first_day, bins=bins, right=False).value_counts().reset_index()
    # Each day is compared against the previous day.
    for index in range(len(ls_psi)):
        ls_psi[index], ls_df_psi[index] = get_psi(
            df_pivot[ls_date[index]], df_pivot[ls_date[index + 1]], bins=bins)
    return ls_psi, ls_df_psi, first_day_cnt
if __name__ == '__main__':
    # Bin edges: bins_1 for GEO_SCORE, bins_2 for probability-like scores.
    bins_1 = list(range(300, 951, 30))  # used for GEO_SCORE
    bins_2 = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]  # used for the other scores
    # Working directory; change as needed.
    rootpath = "D:/da/yangshijin/log/20190129_zwjf/test"
    os.chdir(rootpath)
    # Load the raw log: tab separated, no header, everything read as string.
    # NOTE(review): np.str is a removed alias on numpy >= 1.24 — use str.
    zwjf = pd.read_csv('zwjf', sep='\t', header=None, dtype=np.str)
    # Prefix for all output file names.
    text = 'zwjf'
    # Parse the inner json (positional column 6 of the raw file).
    df = analysis_inner_json(zwjf, 6)
    # Parse the outer json (positional column 7).
    df = analysis_outer_json(df, 7, text)
    # Descriptive statistics.
    """Typically the following columns are used:
    0--apply_time--application timestamp
    GEO_SCORE--total GEO score
    data_cg--gd
    result_score--th
    CDDT000--dt
    The data used for this test has no CDDT000, so that column is skipped.
    Received data may arrive without a header, so when selecting columns,
    double-check the index of the apply_time column in the raw data.
    """
    use_col = [0,'GEO_SCORE', 'data_cg','result_score']
    df = df[use_col]
    df.columns = [0,'geo_score', 'gd', 'th']
    df_desc,apply_time_cnt = get_desc(zwjf, df, 0, text)
    # Compute PSI: truncate timestamps to yyyymmdd first.
    df[0] = df[0].map(lambda x: x[:8])
    ls_date = apply_time_cnt['index'].tolist()
    # Pre-sized result holders, one slot per day after the first;
    # show_psi overwrites them in place.
    ls_psi = ls_date[1:]
    ls_df_psi = ls_date[1:]
    #ls_count = [i+'count' for i in ls_date[1:]]
    #(20181227,2018122820181229,20190105,20190106,20190107,20190108,20190109,20190110,20190111)
    # Use binning() if you only need the per-bin good/bad ratio;
    # use show_psi() if you also need PSI.
    # Returns per-day PSI and per-day bin statistics (both excluding the
    # first day; the first day's bin counts are computed separately).
    ls_psi_geo,ls_df_psi_geo,first_day_cnt = show_psi(df, "index", 0, 'geo_score', ls_date, ls_psi, ls_df_psi, bins=bins_1)