1、项目涉及到的一些函数
# -*- coding: utf-8 -*-
# %%time
# from pyhive import presto
import pandas as pd
import numpy as np
import warnings
import os
from pyhive import presto
import matplotlib.pyplot as plt
import sys
from tqdm import tqdm
from sklearn.externals import joblib
from joblib import Parallel,delayed
import scorecardpy as sc
import toad
import datetime, calendar
import time
from datetime import timedelta, date
from time import *
import toad
import pydotplus
from IPython.display import Image
from sklearn.externals.six import StringIO
from sklearn import tree
from pandas import DataFrame
from sklearn.tree import _tree
from functools import reduce
%matplotlib inline
#透视表功能,agg里面写函数就是按函数统计,不写就是统计values的占比,agg内可以选,count,min,max,sum,mean,len
#按照列为index,行为columns进行透视展示values内对象的分布情况,空值填充0
# pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['bins'],values=['mob4'],fill_value=0,aggfunc=['count'])
#设置行,列,列宽等
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth', 500)
cursor=presto.connect('IP',端口,'jobschedule').cursor()
def read_sql(hql):
cursor.execute(hql)
try:
result = cursor.fetchall()
num_columns = len(cursor.description)
columns_names = [i[0] for i in cursor.description]
except Exception as e:
print(num_columns)
print(columns_names)
data = pd.DataFrame(list(result),columns=columns_names)
return data
####################### PlotKS ##########################
def PlotKS(preds, labels, n=20, asc=True):
# preds is score: asc=1
# preds is prob: asc=0
pred = preds # 预测值
bad = labels # 取1为bad, 0为good
ksds = pd.DataFrame({'bad': bad, 'pred': pred})
ksds['good'] = 1 - ksds.bad
if asc == 1:
ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, True])
elif asc == 0:
ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, True])
ksds1.index = range(len(ksds1.pred))
ksds1['cumsum_good1'] = 1.0*ksds1.good.cumsum()/sum(ksds1.good)
ksds1['cumsum_bad1'] = 1.0*ksds1.bad.cumsum()/sum(ksds1.bad)
if asc == 1:
ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, False])
elif asc == 0:
ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, False])
ksds2.index = range(len(ksds2.pred))
ksds2['cumsum_good2'] = 1.0*ksds2.good.cumsum()/sum(ksds2.good)
ksds2['cumsum_bad2'] = 1.0*ksds2.bad.cumsum()/sum(ksds2.bad)
# ksds1 ksds2 -> average
ksds = ksds1[['cumsum_good1', 'cumsum_bad1']]
ksds['cumsum_good2'] = ksds2['cumsum_good2']
ksds['cumsum_bad2'] = ksds2['cumsum_bad2']
ksds['cumsum_good'] = (ksds['cumsum_good1'] + ksds['cumsum_good2'])/2
ksds['cumsum_bad'] = (ksds['cumsum_bad1'] + ksds['cumsum_bad2'])/2
# ks
ksds['ks'] = ksds['cumsum_bad'] - ksds['cumsum_good']
ksds['tile0'] = range(1, len(ksds.ks) + 1)
ksds['tile'] = 1.0*ksds['tile0']/len(ksds['tile0'])
qe = list(np.arange(0, 1, 1.0/n))
qe.append(1)
qe = qe[1:]
ks_index = pd.Series(ksds.index)
ks_index = ks_index.quantile(q = qe)
ks_index = np.ceil(ks_index).astype(int)
ks_index = list(ks_index)
ksds = ksds.loc[ks_index]
ksds = ksds[['tile', 'cumsum_good', 'cumsum_bad', 'ks']]
ksds0 = np.array([[0, 0, 0, 0]])
ksds = np.concatenate([ksds0, ksds], axis=0)
ksds = pd.DataFrame(ksds, columns=['tile', 'cumsum_good', 'cumsum_bad', 'ks'])
ks_value =abs(ksds.ks).max() #20200502-调整增加abs
ks_pop = ksds.tile[ksds.ks.idxmax()]
print ('ks_value is ' + str(np.round(ks_value, 4)) + ' at pop = ' + str(np.round(ks_pop, 4)))
# chart
plt.plot(ksds.tile, ksds.cumsum_good, label='cum_good',
color='blue', linestyle='-', linewidth=2)
plt.plot(ksds.tile, ksds.cumsum_bad, label='cum_bad',
color='red', linestyle='-', linewidth=2)
plt.plot(ksds.tile, ksds.ks, label='ks',
color='green', linestyle='-', linewidth=2)
plt.axvline(ks_pop, color='gray', linestyle='--')
plt.axhline(ks_value, color='green', linestyle='--')
plt.axhline(ksds.loc[ksds.ks.idxmax(), 'cumsum_good'], color='blue', linestyle='--')
plt.axhline(ksds.loc[ksds.ks.idxmax(),'cumsum_bad'], color='red', linestyle='--')
plt.ti