python d rs normal_Python随笔-pandas、风控评分卡

该博客介绍了Python在数据分析中的应用,包括使用Pandas、Numpy和Matplotlib等库进行数据处理和可视化。文中详细讲解了如何进行数据读取、透视表、分箱、KS曲线绘制以及评分卡的构建和规则提取。此外,还涉及了决策树的使用和规则提取,以及如何利用scorecardpy库创建评分卡。
摘要由CSDN通过智能技术生成

1、项目涉及到的一些函数

# -*- coding: utf-8 -*-

# %%time

# from pyhive import presto

import pandas as pd

import numpy as np

import warnings

import os

from pyhive import presto

import matplotlib.pyplot as plt

import sys

from tqdm import tqdm

from sklearn.externals import joblib

from joblib import Parallel,delayed

import scorecardpy as sc

import toad

import datetime, calendar

import time

from datetime import timedelta, date

from time import *

import toad

import pydotplus

from IPython.display import Image

from sklearn.externals.six import StringIO

from sklearn import tree

from pandas import DataFrame

from sklearn.tree import _tree

from functools import reduce

%matplotlib inline

#透视表功能,agg里面写函数就是按函数统计,不写就是统计values的占比,agg内可以选,count,min,max,sum,mean,len

#按照列为index,行为columns进行透视展示values内对象的分布情况,空值填充0

# pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['bins'],values=['mob4'],fill_value=0,aggfunc=['count'])

#设置行,列,列宽等

pd.set_option('display.max_rows',None)

pd.set_option('display.max_columns',None)

pd.set_option('max_colwidth', 500)

cursor=presto.connect('IP',端口,'jobschedule').cursor()

def read_sql(hql):

cursor.execute(hql)

try:

result = cursor.fetchall()

num_columns = len(cursor.description)

columns_names = [i[0] for i in cursor.description]

except Exception as e:

print(num_columns)

print(columns_names)

data = pd.DataFrame(list(result),columns=columns_names)

return data

####################### PlotKS ##########################

def PlotKS(preds, labels, n=20, asc=True):

# preds is score: asc=1

# preds is prob: asc=0

pred = preds # 预测值

bad = labels # 取1为bad, 0为good

ksds = pd.DataFrame({'bad': bad, 'pred': pred})

ksds['good'] = 1 - ksds.bad

if asc == 1:

ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, True])

elif asc == 0:

ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, True])

ksds1.index = range(len(ksds1.pred))

ksds1['cumsum_good1'] = 1.0*ksds1.good.cumsum()/sum(ksds1.good)

ksds1['cumsum_bad1'] = 1.0*ksds1.bad.cumsum()/sum(ksds1.bad)

if asc == 1:

ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, False])

elif asc == 0:

ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, False])

ksds2.index = range(len(ksds2.pred))

ksds2['cumsum_good2'] = 1.0*ksds2.good.cumsum()/sum(ksds2.good)

ksds2['cumsum_bad2'] = 1.0*ksds2.bad.cumsum()/sum(ksds2.bad)

# ksds1 ksds2 -> average

ksds = ksds1[['cumsum_good1', 'cumsum_bad1']]

ksds['cumsum_good2'] = ksds2['cumsum_good2']

ksds['cumsum_bad2'] = ksds2['cumsum_bad2']

ksds['cumsum_good'] = (ksds['cumsum_good1'] + ksds['cumsum_good2'])/2

ksds['cumsum_bad'] = (ksds['cumsum_bad1'] + ksds['cumsum_bad2'])/2

# ks

ksds['ks'] = ksds['cumsum_bad'] - ksds['cumsum_good']

ksds['tile0'] = range(1, len(ksds.ks) + 1)

ksds['tile'] = 1.0*ksds['tile0']/len(ksds['tile0'])

qe = list(np.arange(0, 1, 1.0/n))

qe.append(1)

qe = qe[1:]

ks_index = pd.Series(ksds.index)

ks_index = ks_index.quantile(q = qe)

ks_index = np.ceil(ks_index).astype(int)

ks_index = list(ks_index)

ksds = ksds.loc[ks_index]

ksds = ksds[['tile', 'cumsum_good', 'cumsum_bad', 'ks']]

ksds0 = np.array([[0, 0, 0, 0]])

ksds = np.concatenate([ksds0, ksds], axis=0)

ksds = pd.DataFrame(ksds, columns=['tile', 'cumsum_good', 'cumsum_bad', 'ks'])

ks_value =abs(ksds.ks).max() #20200502-调整增加abs

ks_pop = ksds.tile[ksds.ks.idxmax()]

print ('ks_value is ' + str(np.round(ks_value, 4)) + ' at pop = ' + str(np.round(ks_pop, 4)))

# chart

plt.plot(ksds.tile, ksds.cumsum_good, label='cum_good',

color='blue', linestyle='-', linewidth=2)

plt.plot(ksds.tile, ksds.cumsum_bad, label='cum_bad',

color='red', linestyle='-', linewidth=2)

plt.plot(ksds.tile, ksds.ks, label='ks',

color='green', linestyle='-', linewidth=2)

plt.axvline(ks_pop, color='gray', linestyle='--')

plt.axhline(ks_value, color='green', linestyle='--')

plt.axhline(ksds.loc[ksds.ks.idxmax(), 'cumsum_good'], color='blue', linestyle='--')

plt.axhline(ksds.loc[ksds.ks.idxmax(),'cumsum_bad'], color='red', linestyle='--')

plt.ti

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值