数据分析:各种方法

iv值,MIC值,KS值,相关系数,皮尔逊相关系数计算方法。

# -*- coding: utf-8 -*-
# @Time    : 2019/2/14 下午4:19
# @Author  : yangchen
# @FileName: IV.py
# @Software: PyCharm
# @Blog    :https://blog.csdn.net/opp003/article

import numpy
import pandas as pd
import pandas
import scipy
from minepy import MINE
from sklearn.metrics import roc_auc_score
from scipy.stats import pearsonr

def single_threshold(data, factor_name):
    """Return the share of the most frequent value of a factor.

    Explanation
    ----------
    Single-value concentration check (conventionally should stay below
    0.95; see the model requirements for the exact threshold).

    Remark
    (1) Applicable to continuous, ordered-discrete and unordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummies or WoE weights)
    (3) Missing values participate in the count
    """
    # normalize=True yields fractions directly; dropna=False keeps NaN as
    # a countable category. Equivalent to the original
    # value_counts().reset_index().iloc[0, 1] / len(data) dance, but does
    # not rely on positional columns or on a numpy scalar's .max().
    return data[factor_name].value_counts(normalize=True, dropna=False).max()


def pearson_p(data, flag_name, factor_name):
    """Return the p-value of the Pearson correlation test.

    Explanation
    ----------
    Correlation significance test (|r| > 0.5 strong, 0.3-0.5 moderate,
    0.1-0.3 weak); the association is significant when p < 0.05.

    Remark
    (1) Applicable to continuous, ordered-discrete and unordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummies or WoE weights)
    (3) Rows with missing values are excluded before computing.
        Bug fix: the original passed NaNs straight to pearsonr, which
        produced a NaN p-value instead of honoring this documented rule.
    """
    pair = data[[flag_name, factor_name]].dropna()
    return pearsonr(pair[flag_name], pair[factor_name])[1]


def get_str(x):
    """Format a scalar as a string; floats use 17 significant digits.

    17 significant digits are enough to round-trip an IEEE-754 double
    exactly (repr-like fidelity), which is why floats get special
    treatment instead of plain str().
    """
    # isinstance replaces the original `type(x) in [...]` checks; the
    # pointless bare try/except around str() is dropped (str() on an
    # arbitrary object does not fail in practice).
    if isinstance(x, (float, numpy.float16, numpy.float32, numpy.float64)):
        return '{0:.17}'.format(x)
    return str(x)


def group_by_df(data, flag_name, factor_name, bad_name, good_name, discrete_list=None):
    """Pivot raw samples into per-value good/bad counts.

    Explanation
    ----------
    For each distinct factor value, count how many good and bad samples
    carry it. The flag column must take exactly two values; after
    unstacking, the lower flag level becomes ``good_name`` and the higher
    one ``bad_name`` (column order of the unstacked frame).

    Parameters
    ----------
    data: dataframe
        Raw samples.

    flag_name: string
        Label column name.

    factor_name: string
        Factor (feature) column name.

    bad_name: string
        Output column name holding the bad-sample count.

    good_name: string
        Output column name holding the good-sample count.

    discrete_list: list, default=None (treated as empty list)
        Names of factors that must be handled as discrete/non-numeric.

    Return
    ------
    data1: dataframe
        One row per factor value with good count, bad count, '%Bad_Rate'
        and a 'Char_Type' marker; an empty DataFrame when the input is
        empty or the flag does not have exactly two levels.
    """
    # Avoid the mutable default argument; None behaves like [].
    if discrete_list is None:
        discrete_list = []
    if len(data) == 0:
        return pandas.DataFrame()
    data1 = data[flag_name].groupby([data[factor_name], data[flag_name]]).count()
    data1 = data1.unstack()
    data1 = data1.reset_index()
    data1 = data1.fillna(0)
    if len(data1.columns) != 3:
        # Flag has a number of levels other than two: good/bad is undefined.
        return pandas.DataFrame()
    data1.columns = [factor_name, good_name, bad_name]
    # Bug fix: the original assigned a bare map() object (Python 2
    # leftover); under Python 3 pandas cannot take its length. Use a
    # vectorized expression instead.
    data1['%Bad_Rate'] = data1[bad_name] / (data1[bad_name] + data1[good_name]).astype(float)
    try:
        if factor_name not in discrete_list:
            data1[factor_name] = data1[factor_name].astype(float)
            data1 = data1.sort_values(by=[factor_name], ascending=True)
            # list(...) is required in Python 3: a lazy map cannot be
            # assigned as a DataFrame column.
            data1[factor_name] = list(map(get_str, data1[factor_name]))
            data1['Char_Type'] = 'numeric'
        else:
            data1 = data1.sort_values(by=['%Bad_Rate'], ascending=True)
            data1['Char_Type'] = 'non-numeric'
    except (ValueError, TypeError):
        # Values not castable to float: fall back to discrete treatment.
        data1 = data1.sort_values(by=['%Bad_Rate'], ascending=True)
        data1['Char_Type'] = 'non-numeric'
    data1 = data1.reset_index(drop=True)
    return data1


def correlation(data, flag_name, factor_name, corr_method):
    """Return the correlation coefficient between flag and factor.

    Explanation
    ----------
    Correlation coefficient (|r| > 0.5 strong, 0.3-0.5 moderate,
    0.1-0.3 weak); interpret together with a t-test.
    corr_method is one of {'pearson', 'kendall', 'spearman'}.

    Remark
    (1) Applicable to continuous and ordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummies or WoE weights)
    (3) Missing values are excluded by pandas' pairwise corr computation
    Returns None when the factor cannot be cast to float.
    """
    try:
        # Bug fix: the original wrote the astype(float) result back into
        # the caller's DataFrame, mutating shared input. Work on a copy.
        pair = data[[flag_name, factor_name]].copy()
        pair[factor_name] = pair[factor_name].astype(float)
        return pair.corr(method=corr_method)[flag_name][factor_name]
    except (ValueError, TypeError, KeyError):
        return None


def KS(data, flag_name, factor_name, discrete_list):
    """Compute the KS statistic and the factor value where it is reached.

    Remark
    (1) Applicable to continuous, ordered-discrete and unordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummies or WoE weights)
    (3) Rows with a missing factor value are excluded

    Return
    ------
    (ks, value): the maximum CDF gap and the factor value (cut point)
    at which it occurs; on ties, the first row in sort order wins.
    """
    data = data[data[factor_name].notnull()]
    data1 = group_by_df(data=data, flag_name=flag_name, factor_name=factor_name, bad_name='bad', good_name='good',
                        discrete_list=discrete_list)
    # Cumulative distribution of bad/good counts over the sorted bins.
    data1['default_CDF'] = data1['bad'].cumsum()
    data1['undefault_CDF'] = data1['good'].cumsum()
    bad_total = float(data1['bad'].sum())
    good_total = float(data1['good'].sum())
    # Bug fix: the original assigned bare map() objects as columns and
    # indexed a filter() object with [0] — both Python 2 leftovers that
    # fail under Python 3. Vectorized equivalents below.
    data1['default_CDF_ratio'] = data1['default_CDF'] / bad_total
    data1['undefault_CDF_ratio'] = data1['undefault_CDF'] / good_total
    ks_CDF = (data1['default_CDF_ratio'] - data1['undefault_CDF_ratio']).abs()
    ks = ks_CDF.max()
    # idxmax returns the first index attaining the maximum, matching the
    # original's "first match" semantics.
    value = data1[factor_name][ks_CDF.idxmax()]
    return ks, value


def MIC(data, flag_name, factor_name):
    """Compute the maximal information coefficient (MIC) of a factor.

    Remark
    (1) Applicable to continuous and ordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummies or WoE weights)
    (3) Rows with a missing factor value are excluded before scoring
    """
    valid = data[data[factor_name].notnull()]
    miner = MINE(alpha=0.6, c=15)
    miner.compute_score(valid[factor_name], valid[flag_name])
    return miner.mic()


def AR(data, flag_name, factor_name):
    """Compute the accuracy ratio AR = 2*AUC - 1 (Gini) of a factor.

    Remark
    (1) Applicable to continuous and ordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummies or WoE weights)
    (3) Rows with a missing factor value are excluded
    Returns None when the factor is not castable to float or the AUC is
    undefined (e.g. only one flag class present).
    """
    try:
        # Bug fix: the original computed astype(float) but discarded the
        # result, so raw (possibly string) values were fed to
        # roc_auc_score. Keep and use the cast series.
        scores = data[factor_name].astype(float)
    except (ValueError, TypeError):
        return None
    mask = scores.notnull()
    try:
        auc = roc_auc_score(data[flag_name][mask], scores[mask])
    except ValueError:
        # roc_auc_score raises ValueError when only one class is present.
        return None
    return 2 * auc - 1


def pearson_chi2(data, flag_name, factor_name, discrete_list):
    """Return the p-value of Pearson's chi-squared test of independence.

    The statistic follows chi2 with (number of factor levels - 1)
    degrees of freedom.

    Remark
    (1) Applicable to ordered-discrete and unordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummies or WoE weights)
    (3) Rows with a missing factor value are excluded
    """
    data = data[data[factor_name].notnull()]
    data1 = group_by_df(data, flag_name, factor_name, 'bad', 'good', discrete_list=discrete_list)
    good = data1['good'].sum()
    bad = data1['bad'].sum()
    n = float(good + bad)
    # Expected counts under independence: column_total * row_total / n.
    row_total = data1['good'] + data1['bad']
    exp_good = good * row_total / n
    exp_bad = bad * row_total / n
    # Bug fix: the original assigned bare map() objects as columns
    # (Python 2 leftover that breaks under Python 3); vectorized instead.
    data1['good_new'] = (data1['good'] - exp_good) ** 2 / exp_good
    data1['bad_new'] = (data1['bad'] - exp_bad) ** 2 / exp_bad
    chi = data1['good_new'].sum() + data1['bad_new'].sum()
    return scipy.stats.chi2.sf(chi, len(data1) - 1)


def IV(data, flag_name, factor_name, discrete_list):
    """Compute the information value (IV) of a factor.

    Remark
    (1) Applicable to continuous, ordered-discrete and unordered-discrete factors
    (2) Applicable to coarse-binned derived factors (dummies or WoE weights)
    (3) Rows with a missing factor value are excluded
    """
    valid = data[data[factor_name].notnull()]
    grouped = group_by_df(valid, flag_name, factor_name, 'bad', 'good', discrete_list=discrete_list)
    good_total = float(grouped['good'].sum())
    bad_total = float(grouped['bad'].sum())

    def woe_term(g, b):
        # A zero cell contributes nothing (the log would be undefined).
        if g == 0 or b == 0:
            return 0
        return (g / good_total - b / bad_total) * numpy.log(g * bad_total / (good_total * b))

    grouped['iv'] = [woe_term(g, b) for g, b in zip(grouped['good'], grouped['bad'])]
    return grouped['iv'].sum()


def main():
    """Entry point: compute the IV of every feature column of a local CSV
    and print the (column, IV) pairs sorted by column name, descending.

    NOTE(review): the input path and dropped columns are hardcoded to one
    developer's machine and dataset.
    """
    df = pd.read_csv('/Users/yangchen/Desktop/loan_data/wx_daiqian_label.csv')
    # Drop identifier/date columns that are not model features.
    X = df.drop(['ENCODE_CERT_CODE', 'ENCODE_MOBILE_PHONE', 'province', 'LOAN_START_DATE'], axis=1)
    cols = X.columns
    result_list = {}
    for col in cols:
        value = IV(X, 'label', col,[])
        result_list[col] = value
        # print(col + ' : ' + str(value))
    # NOTE(review): item[0] sorts by column NAME; sorting features by IV
    # strength would use item[1] — confirm which ordering is intended.
    dic_sort = sorted(result_list.items(), key=lambda item: item[0], reverse=True)
    print(dic_sort)

if __name__ == '__main__':
    main()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值