# IV 值、MIC 值、KS 值、相关系数、皮尔森相关系数计算方法。
# -*- coding: utf-8 -*-
# @Time : 2019/2/14 下午4:19
# @Author : yangchen
# @FileName: IV.py
# @Software: PyCharm
# @Blog :https://blog.csdn.net/opp003/article
import numpy
import pandas as pd
import pandas
import scipy
from minepy import MINE
from sklearn.metrics import roc_auc_score
from scipy.stats import pearsonr
def single_threshold(data, factor_name):
    """
    Explanation
    ----------
    Return the proportion of the single most frequent value of a factor
    (conventionally the factor is dropped when this exceeds ~0.95; consult
    the model requirements for the exact cutoff).

    Remark
    (1) Applicable to continuous, ordered-discrete and unordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummy variables or WOE)
    (3) Missing values participate in the computation (dropna=False)
    """
    # BUG FIX: the previous code computed a scalar float and then called
    # .max() on it (AttributeError). normalize=True yields the value share
    # Series directly; its max is the dominant-value proportion.
    percent = data[factor_name].value_counts(normalize=True, dropna=False)
    return percent.max()
def pearson_p(data, flag_name, factor_name):
    """
    Explanation
    ----------
    Pearson correlation test p-value between the label and a factor
    (|r| > 0.5 strong, 0.3~0.5 moderate, 0.1~0.3 weak; the test p-value
    should be below 0.05).

    Remark
    (1) Applicable to continuous, ordered-discrete and unordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummy variables or WOE)
    (3) Rows with missing values are excluded from the computation

    Returns the p-value of the two-sided test.
    """
    # BUG FIX: the docstring promised missing values were dropped, but the
    # old code passed the raw columns straight to pearsonr (NaN -> nan/raise).
    sub = data[[flag_name, factor_name]].dropna()
    _, p_value = pearsonr(sub[flag_name], sub[factor_name])
    return p_value
def get_str(x):
    """
    Render a scalar as a string.

    Floats are formatted with 17 significant digits (enough for an exact
    round trip of a 64-bit double, repr-style); ints via str(); anything
    else falls back to str() and, failing that, is returned unchanged.
    """
    # type() (not isinstance) is deliberate: it excludes bool, which would
    # otherwise match the int branch.
    if type(x) in (float, numpy.float64, numpy.float16, numpy.float32):
        return '{0:.17}'.format(x)
    if type(x) in (int, numpy.int8, numpy.int16, numpy.int32, numpy.int64):
        return str(x)
    try:
        return str(x)
    except Exception:  # narrowed from bare except: only a broken __str__ lands here
        return x
def group_by_df(data, flag_name, factor_name, bad_name, good_name, discrete_list=None):
    """
    Explanation
    ----------
    Reshape the sample into one row per factor value with the good / bad
    sample counts for that value.

    Parameters
    ----------
    data: dataframe
        raw sample
    flag_name: string
        label column name (expected binary: the unstack must yield 2 columns)
    factor_name: string
        factor column name
    bad_name: string
        output column name for the bad-sample count
    good_name: string
        output column name for the good-sample count
    discrete_list: list, default=None (treated as [])
        names of factors to treat as discrete/non-numeric

    Return
    ------
    data1: dataframe with columns [factor value, good count, bad count,
    %Bad_Rate, Char_Type]; empty DataFrame when input is empty or the label
    is not binary.
    """
    # Avoid the mutable-default-argument pitfall.
    if discrete_list is None:
        discrete_list = []
    if len(data) == 0:
        return pandas.DataFrame()
    data1 = data[flag_name].groupby([data[factor_name], data[flag_name]]).count()
    data1 = data1.unstack().reset_index().fillna(0)
    # 3 columns <=> factor value + one column per label level (binary label).
    if len(data1.columns) != 3:
        return pandas.DataFrame()
    # Column order after unstack is [factor, label=0, label=1]; label 0 is
    # assumed to be "good" — TODO confirm against the label encoding.
    data1.columns = [factor_name, good_name, bad_name]
    # BUG FIX: previously a bare map() object was assigned (breaks on Py3);
    # vectorized division is both correct and faster.
    data1['%Bad_Rate'] = data1[bad_name] / (data1[bad_name] + data1[good_name]).astype(float)
    try:
        if factor_name not in discrete_list:
            # Numeric factor: sort by value, then re-stringify for stable keys.
            data1[factor_name] = data1[factor_name].astype(float)
            data1 = data1.sort_values(by=[factor_name], ascending=True)
            data1[factor_name] = list(map(get_str, data1[factor_name]))
            data1['Char_Type'] = 'numeric'
        else:
            data1 = data1.sort_values(by=['%Bad_Rate'], ascending=True)
            data1['Char_Type'] = 'non-numeric'
    except (ValueError, TypeError):
        # astype(float) failed: fall back to treating the factor as discrete.
        data1 = data1.sort_values(by=['%Bad_Rate'], ascending=True)
        data1['Char_Type'] = 'non-numeric'
    return data1.reset_index(drop=True)
def correlation(data, flag_name, factor_name, corr_method):
    """
    Explanation
    ----------
    Correlation coefficient between the label and a factor
    (|r| > 0.5 strong, 0.3~0.5 moderate, 0.1~0.3 weak); interpret together
    with a t-test.
    corr_method: one of {'pearson', 'kendall', 'spearman'}

    Remark
    (1) Applicable to continuous and ordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummy variables or WOE)
    (3) pandas .corr() ignores missing values pairwise

    Returns None when the factor cannot be coerced to float.
    """
    try:
        # FIX: work on a copy — the old code wrote the float-cast column back
        # into the caller's DataFrame (surprise side effect / SettingWithCopy).
        sub = data[[flag_name, factor_name]].copy()
        sub[factor_name] = sub[factor_name].astype(float)
        return sub.corr(method=corr_method)[flag_name][factor_name]
    except (ValueError, TypeError, KeyError):  # narrowed from bare except
        return None
def KS(data, flag_name, factor_name, discrete_list):
    """
    Explanation
    ----------
    Compute the KS statistic and the factor value at which it is attained.

    Remark
    (1) Applicable to continuous, ordered-discrete and unordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummy variables or WOE)
    (3) Missing values are excluded from the computation

    Returns (ks, value): the max CDF gap and the factor value where it occurs.
    """
    data = data[data[factor_name].notnull()]
    data1 = group_by_df(data=data, flag_name=flag_name, factor_name=factor_name,
                        bad_name='bad', good_name='good',
                        discrete_list=discrete_list)
    # BUG FIX: the old code assigned bare map() objects to columns and used
    # filter(...)[0], both of which break under Python 3. Vectorized cumsum
    # and idxmax (first occurrence of the max, same as the old filter) are
    # equivalent and Py3-safe.
    data1['default_CDF'] = data1['bad'].cumsum()
    data1['undefault_CDF'] = data1['good'].cumsum()
    bad_total = float(data1['bad'].sum())
    good_total = float(data1['good'].sum())
    data1['default_CDF_ratio'] = data1['default_CDF'] / bad_total
    data1['undefault_CDF_ratio'] = data1['undefault_CDF'] / good_total
    ks_CDF = (data1['default_CDF_ratio'] - data1['undefault_CDF_ratio']).abs()
    ks = ks_CDF.max()
    value = data1[factor_name][ks_CDF.idxmax()]  # cut point attaining the KS
    return ks, value
def MIC(data, flag_name, factor_name):
    """
    Explanation
    ----------
    Compute the Maximal Information Coefficient between the label and a factor.

    Remark
    (1) Applicable to continuous and ordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummy variables or WOE)
    (3) Missing values are excluded from the computation
    """
    valid = data[data[factor_name].notnull()]
    estimator = MINE(alpha=0.6, c=15)  # minepy defaults for MIC estimation
    estimator.compute_score(valid[factor_name], valid[flag_name])
    return estimator.mic()
def AR(data, flag_name, factor_name):
    """
    Explanation
    ----------
    Compute the Accuracy Ratio (Gini): AR = 2 * AUC - 1.

    Remark
    (1) Applicable to continuous and ordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummy variables or WOE)
    (3) Missing values are excluded from the computation

    Returns None when the factor cannot be coerced to float (or the AUC is
    undefined, e.g. a single-class label — roc_auc_score raises ValueError).
    """
    try:
        data = data[data[factor_name].notnull()]
        # BUG FIX: the old code discarded the astype(float) result and fed the
        # raw column to roc_auc_score; now the numeric cast is actually used.
        scores = data[factor_name].astype(float)
        auc = roc_auc_score(data[flag_name], scores)
        return 2 * auc - 1
    except (ValueError, TypeError):  # narrowed from bare except
        return None
def pearson_chi2(data, flag_name, factor_name, discrete_list):
    """
    Explanation
    ----------
    Pearson chi-square test p-value for factor vs. label; the statistic
    follows chi2 with (number of factor levels - 1) degrees of freedom
    (binary label, so (levels-1) * (2-1)).

    Remark
    (1) Applicable to ordered-discrete and unordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummy variables or WOE)
    (3) Missing values are excluded from the computation
    """
    data = data[data[factor_name].notnull()]
    data1 = group_by_df(data, flag_name, factor_name, 'bad', 'good',
                        discrete_list=discrete_list)
    good = data1['good'].sum()
    bad = data1['bad'].sum()
    n = float(good + bad)
    # BUG FIX: bare map() objects were assigned to columns (breaks on Py3);
    # compute the classic (observed - expected)^2 / expected terms vectorized.
    row_total = data1['good'] + data1['bad']
    expected_good = good * row_total / n
    expected_bad = bad * row_total / n
    chi = (((data1['good'] - expected_good) ** 2) / expected_good).sum() \
        + (((data1['bad'] - expected_bad) ** 2) / expected_bad).sum()
    return scipy.stats.chi2.sf(chi, len(data1) - 1)
def IV(data, flag_name, factor_name, discrete_list):
    """
    Explanation
    ----------
    Compute the Information Value of a factor.

    Remark
    (1) Applicable to continuous, ordered-discrete and unordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummy variables or WOE)
    (3) Missing values are excluded from the computation
    """
    valid = data[data[factor_name].notnull()]
    grouped = group_by_df(valid, flag_name, factor_name, 'bad', 'good',
                          discrete_list=discrete_list)
    good_total = float(grouped['good'].sum())
    bad_total = float(grouped['bad'].sum())
    contributions = []
    for g, b in zip(grouped['good'], grouped['bad']):
        if g == 0 or b == 0:
            # A bin with no good or no bad samples would make the WOE
            # log term blow up; its contribution is zeroed out.
            contributions.append(0)
        else:
            woe = numpy.log(g * bad_total / (good_total * b))
            contributions.append((g / good_total - b / bad_total) * woe)
    grouped['iv'] = contributions
    return grouped['iv'].sum()
def main():
    """Load the sample file, compute IV for every candidate column and print the ranking."""
    df = pd.read_csv('/Users/yangchen/Desktop/loan_data/wx_daiqian_label.csv')
    # Drop identifier / non-feature columns before scoring.
    features = df.drop(['ENCODE_CERT_CODE', 'ENCODE_MOBILE_PHONE', 'province', 'LOAN_START_DATE'], axis=1)
    iv_by_column = {}
    for column in features.columns:
        iv_by_column[column] = IV(features, 'label', column, [])
    # NOTE(review): this sorts by column name (item[0]) descending; if the
    # intent was to rank by IV score, the key should be item[1] — confirm.
    ranked = sorted(iv_by_column.items(), key=lambda item: item[0], reverse=True)
    print(ranked)
# Script entry point: run the IV scan only when executed directly.
if __name__ == '__main__':
    main()