If a feature column name is duplicated, merge and drop the duplicates:
# Selecting a duplicated column name returns a DataFrame, so shape[1] >= 2
# (if the name were unique this would return a Series and shape[1] would raise)
if data_all_feature_df['advance新增多头label1'].shape[1] >= 2:
    # merge the duplicated columns by taking the row-wise max
    data_all_feature_df['advance新增多头label'] = data_all_feature_df['advance新增多头label1'].max(axis=1)
    data_all_feature_df.drop(columns=['advance新增多头label1'], inplace=True)
    data_all_feature_df['advance新增多头label1'] = data_all_feature_df['advance新增多头label']
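If merging is not needed, a more general sketch that simply keeps the first occurrence of each duplicated column name:
data_all_feature_df = data_all_feature_df.loc[:, ~data_all_feature_df.columns.duplicated()]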
# similar to pandas DataFrame[col].value_counts()
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))
# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])
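Equivalently, the DataFrame-level nunique gives the same ascending listing in one line:
X_train[object_cols].nunique().sort_values()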
Dummy-variable encoding: specify which features to encode and the column prefix:
for fea in feature_cols:
    df = pd.get_dummies(df, columns=[fea], prefix=fea)
Try to avoid the pattern below, since it easily introduces bugs during the join (e.g., clashing column names):
embark_dummies = pd.get_dummies(train_data['Embarked'])
# get_dummies() creates one column per distinct value of the column, one column per possible value, similar to one-hot encoding
train_data = train_data.join(embark_dummies)
train_data.drop(['Embarked'], axis=1, inplace=True)
embark_dummies = train_data[['S', 'C', 'Q']]
embark_dummies.head()
categorical_features='all': this OneHotEncoder parameter specifies which features to encode; by default every feature is encoded. You can also pick specific features yourself, by index or by a boolean mask, as in the example below. (Note: this parameter was deprecated in scikit-learn 0.20 and removed in 0.22; see the ColumnTransformer sketch after the example.)
# -*- coding: utf-8 -*-
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(categorical_features=[0, 2])  # equivalent to [True, False, True]
enc.fit([[0, 0, 3],
         [1, 1, 0],
         [0, 2, 1],
         [1, 0, 2]])
ans = enc.transform([[0, 2, 3]]).toarray()
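In scikit-learn 0.22+ the categorical_features argument is gone; a sketch of the equivalent using ColumnTransformer, on the same toy data as above:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('onehot', OneHotEncoder(), [0, 2])],  # encode columns 0 and 2 only
                       remainder='passthrough')                # pass column 1 through unchanged
ans = ct.fit_transform([[0, 0, 3],
                        [1, 1, 0],
                        [0, 2, 1],
                        [1, 0, 2]])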
# Equal-frequency binning (WOE / IV)
import numpy as np
import pandas as pd

def bin_frequency(x, y, n=10):  # x: variable to bin; y: target variable; n: number of bins
    total = y.count()   # 1 total sample count
    bad = y.sum()       # 2 bad sample count (y == 1)
    good = total - bad  # 3 good sample count
    if x.value_counts().shape[0] == 2:  # 4 if the variable only takes two values (e.g. 0/1), just split into two groups
        d1 = pd.DataFrame({'x': x, 'y': y, 'bucket': pd.cut(x, 2)})
    else:
        d1 = pd.DataFrame({'x': x, 'y': y, 'bucket': pd.qcut(x, n, duplicates='drop')})  # 5 equal-frequency binning via pd.qcut
    d2 = d1.groupby('bucket', as_index=True)  # 6 group by bin
    d3 = pd.DataFrame()
    d3['min_bin'] = d2.x.min()  # 7 left edge of each bin
    d3['max_bin'] = d2.x.max()  # 8 right edge of each bin
    d3['bad'] = d2.y.sum()      # 9 number of bad samples in each bin
    d3['total'] = d2.y.count()  # 10 total samples in each bin
    d3['bad_rate'] = d3['bad'] / d3['total']  # 11 share of bad samples within each bin
    d3['badattr'] = d3['bad'] / bad           # 12 each bin's share of all bad samples
    d3['goodattr'] = (d3['total'] - d3['bad']) / good  # 13 each bin's share of all good samples
    d3['WOEi'] = np.log(d3['badattr'] / d3['goodattr'])  # 14 WOE of each bin
    IV = ((d3['badattr'] - d3['goodattr']) * d3['WOEi']).sum()  # 15 IV of the variable
    d3['IVi'] = (d3['badattr'] - d3['goodattr']) * d3['WOEi']   # 16 IV contribution of each bin
    d4 = d3.sort_values(by='min_bin').reset_index(drop=True)    # 17 sort bins by left edge, ascending
    cut = []
    cut.append(float('-inf'))
    for i in d4.min_bin:
        cut.append(i)
    cut.append(float('inf'))
    WOEi = list(d4['WOEi'].round(3))
    return IV, cut, WOEi, d4
columns_iv = [
    '7天内申请人在多个平台申请借款',
    '1个月内申请人在多个平台申请借款',
    '3个月内申请人在多个平台申请借款',
    '7天内关联P2P网贷平台数',
    '1个月内关联P2P网贷平台数',
    '3个月内关联P2P网贷平台数',
    'X3个月内申请人手机号作为第二联系人手机号出现的次数',
    'X3个月内申请人手机号作为前三联系人手机号出现的次数',
]
ivs = []
for i in columns_iv:
    print(i)
    IV, cut, WOEi, d4 = bin_frequency(df[i], df['flag'])
    print('IV=', IV)
    ivs.append(IV)
    print(d4)
# Binning with user-defined cut points
def bin_cut(x, y, cut):  # x: variable to bin; y: target variable; cut: list of cut points
    total = y.count()   # total sample count
    bad = y.sum()       # bad sample count
    good = total - bad  # good sample count
    if x.value_counts().shape[0] == 2:
        d1 = pd.DataFrame({'x': x, 'y': y, 'bucket': pd.cut(x, 2)})
    else:
        d1 = pd.DataFrame({'x': x, 'y': y, 'bucket': pd.cut(x, cut)})  # bin with pd.cut at the given cut points
    d2 = d1.groupby('bucket', as_index=True)  # group by bin
    d3 = pd.DataFrame()
    d3['min_bin'] = d2.x.min()  # left edge of each bin
    d3['max_bin'] = d2.x.max()  # right edge of each bin
    d3['bad'] = d2.y.sum()      # number of bad samples in each bin
    d3['total'] = d2.y.count()  # total samples in each bin
    d3['bad_rate'] = d3['bad'] / d3['total']  # share of bad samples within each bin
    d3['badattr'] = d3['bad'] / bad           # each bin's share of all bad samples
    d3['goodattr'] = (d3['total'] - d3['bad']) / good  # each bin's share of all good samples
    d3['WOEi'] = np.log(d3['badattr'] / d3['goodattr'])  # WOE of each bin
    IV = ((d3['badattr'] - d3['goodattr']) * d3['WOEi']).sum()  # IV of the variable
    d3['IVi'] = (d3['badattr'] - d3['goodattr']) * d3['WOEi']   # IV contribution of each bin
    d4 = d3.sort_values(by='min_bin').reset_index(drop=True)    # sort bins by left edge, ascending
    cut = []
    cut.append(float('-inf'))
    for i in d4.min_bin:
        cut.append(i)
    cut.append(float('inf'))
    WOEi = list(d4['WOEi'].round(3))
    return IV, cut, WOEi, d4
IV, cut, WOEi, d4 = bin_cut(df['3个月内关联P2P网贷平台数'], df['y'], [-1, 0, 1, 4, 40])
Reference: 风控建模中的IV和WOE (阿黎逸阳's blog, CSDN) — iv woe
pandas: aggregate multiple statistics on one grouping dimension, then flatten the columns and reset the index:
age_answer_rat = call_status_df.groupby('age').agg({"answer_statuss":['count','mean']})
prefix=''
age_answer_rat.columns = [prefix + '_'.join(col).strip() for col in age_answer_rat.columns.values]
age_answer_rat.reset_index(drop=False, inplace=True)
age_answer_rat.head()
pandas time deltas:
timediff = pd.Timedelta('2 days 2 hours 15 minutes 30 seconds')
str(pd.Timestamp('2022-06-04') - pd.Timedelta('3 days'))
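A minimal sketch of a derived day-gap column, assuming hypothetical datetime columns apply_time and due_time:
df['gap_days'] = (pd.to_datetime(df['due_time']) - pd.to_datetime(df['apply_time'])).dt.days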
Custom aggregation via apply after groupby:
- addr_num = data_person.groupby(['一级地址']).apply(lambda x: x['身份证号码'].count()).reset_index(name='人数').sort_values(by=['人数'])
- addr_sum_overdue = data_person.groupby(['一级地址']).apply(lambda x: x['逾期金额'].sum()).reset_index(name='逾期总金额')
- addr_sum_un_overdue = data_person.groupby(['一级地址']).apply(lambda x: x['借贷金额'].sum() - x['逾期金额'].sum()).reset_index(name='未逾期总金额')
After a model goes live, its scores can drift over time or because features are sourced differently online vs. offline. To gauge stability, compute the model's PSI, compare the score distributions directly, and, as a final check, score the same users before and after deployment and verify the scores agree.
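For reference, with A_i the actual (new) proportion and E_i the expected (baseline) proportion falling in bin i:
PSI = sum_i (A_i - E_i) * ln(A_i / E_i)
A common rule of thumb: PSI < 0.1 is stable, 0.1-0.25 needs watching, > 0.25 signals a significant shift.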
PSI calculation: equal-width & equal-frequency
import numpy as np
import pandas as pd
def psi_1d_by_equal_frequency(arr, arr_old, bins=10, roundn=6, inf=True):
    """Population stability index between two numeric variables, with equal-frequency bins.
    Args:
        arr (pd.Series): new data
        arr_old (pd.Series): data from an earlier period, used as the baseline (control group, or the training set)
        bins (int): number of bins
        roundn (int): number of decimal places kept for the cutoffs
        inf (bool): whether to extend the outer bin edges to +/- infinity
    Returns:
        float
    """
    cutoffs = arr_old.quantile(np.linspace(0, 1, bins + 1))
    if inf:
        # replace the outer edges with -np.inf and np.inf (positional access, since the index holds quantile levels)
        cutoffs.iloc[0], cutoffs.iloc[-1] = -np.inf, np.inf
    cutoffs = cutoffs.round(roundn).unique()
    arr_stats = pd.cut(arr, cutoffs).value_counts(normalize=True)
    arr_old_stats = pd.cut(arr_old, cutoffs).value_counts(normalize=True)
    psi = np.sum((arr_stats - arr_old_stats) * np.log(arr_stats / arr_old_stats))
    return psi
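Hypothetical usage, comparing a test-set score column against the training set as baseline (df_train/df_test with a 'score' column are assumed names):
psi = psi_1d_by_equal_frequency(df_test['score'], df_train['score'], bins=10)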
# ref https://zhuanlan.zhihu.com/p/161638711
import math
import matplotlib.pyplot as plt

def psi_calc(actual, predict, bins=10):
    '''
    Purpose: compute the PSI of model scores with equal-width bins, and plot the actual vs. expected distributions.
    Inputs:
        actual: 1-d array or Series of training-set model scores
        predict: 1-d array or Series of test-set model scores
        bins: number of bins
    Output:
        dict of the form {'psi': PSI value, 'psi_fig': actual vs. expected distribution curves}
    '''
    psi_dict = {}
    actual = np.sort(actual)
    predict = np.sort(predict)
    actual_len = len(actual)
    predict_len = len(predict)
    psi_cut = []
    actual_bins = []
    predict_bins = []
    actual_min = actual.min()
    actual_max = actual.max()
    cuts = []
    binlen = (actual_max - actual_min) / bins
    for i in range(1, bins):
        cuts.append(actual_min + i * binlen)
    for i in range(1, bins + 1):
        if i == 1:
            lowercut = float('-Inf')
            uppercut = cuts[i-1]
        elif i == bins:
            lowercut = cuts[i-2]
            uppercut = float('Inf')
        else:
            lowercut = cuts[i-2]
            uppercut = cuts[i-1]
        # +1 smoothing so an empty bin cannot produce log(0)
        actual_cnt = ((actual >= lowercut) & (actual < uppercut)).sum() + 1
        predict_cnt = ((predict >= lowercut) & (predict < uppercut)).sum() + 1
        actual_pct = (actual_cnt + 0.0) / actual_len
        predict_pct = (predict_cnt + 0.0) / predict_len
        psi_cut.append((actual_pct - predict_pct) * math.log(actual_pct / predict_pct))
        actual_bins.append(actual_pct)
        predict_bins.append(predict_pct)
    psi = sum(psi_cut)
    nbins = len(actual_bins)
    xlab = np.arange(1, nbins + 1)
    fig = plt.figure()
    plt.plot(xlab, np.array(actual_bins), 'r', label='actual')
    plt.plot(xlab, np.array(predict_bins), 'b', label='predict')
    plt.legend(loc='best')
    plt.title('Psi Curve')
    plt.close()
    psi_dict['psi'] = psi
    psi_dict['psi_fig'] = fig
    return psi_dict
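Hypothetical usage with train/test score arrays:
res = psi_calc(train_scores, test_scores, bins=10)
print(res['psi'])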
### Compute PSI for a single feature (continuous or low-cardinality; math is imported above)
def fea_psi_calc(actual, predict, bins=10):
    '''
    Purpose: compute the PSI of a continuous or discrete variable, with equal-width bins.
    Inputs:
        actual: 1-d array or Series of the variable in the training set
        predict: 1-d array or Series of the variable in the test set
        bins: number of bins
    Output:
        dict of the form {'psi': PSI value}
    '''
    psi_dict = {}
    actual = np.sort(actual)
    actual_distinct = np.sort(list(set(actual)))
    predict = np.sort(predict)
    actual_len = len(actual)
    actual_distinct_len = len(actual_distinct)
    predict_len = len(predict)
    psi_cut = []
    actual_bins = []
    predict_bins = []
    actual_min = actual.min()
    actual_max = actual.max()
    cuts = []
    binlen = (actual_max - actual_min) / bins
    if actual_distinct_len < bins:
        # fewer distinct values than bins: use the distinct values themselves as cut points
        for i in actual_distinct:
            cuts.append(i)
        for i in range(2, actual_distinct_len + 1):
            if i == actual_distinct_len:  # make the last interval open-ended so the maximum value is counted
                lowercut = cuts[i-2]
                uppercut = float('Inf')
            else:
                lowercut = cuts[i-2]
                uppercut = cuts[i-1]
            # +1 smoothing so an empty bin cannot produce log(0)
            actual_cnt = ((actual >= lowercut) & (actual < uppercut)).sum() + 1
            predict_cnt = ((predict >= lowercut) & (predict < uppercut)).sum() + 1
            actual_pct = (actual_cnt + 0.0) / actual_len
            predict_pct = (predict_cnt + 0.0) / predict_len
            psi_cut.append((actual_pct - predict_pct) * math.log(actual_pct / predict_pct))
            actual_bins.append(actual_pct)
            predict_bins.append(predict_pct)
    else:
        for i in range(1, bins):
            cuts.append(actual_min + i * binlen)
        for i in range(1, bins + 1):
            if i == 1:
                lowercut = float('-Inf')
                uppercut = cuts[i-1]
            elif i == bins:
                lowercut = cuts[i-2]
                uppercut = float('Inf')
            else:
                lowercut = cuts[i-2]
                uppercut = cuts[i-1]
            actual_cnt = ((actual >= lowercut) & (actual < uppercut)).sum() + 1
            predict_cnt = ((predict >= lowercut) & (predict < uppercut)).sum() + 1
            actual_pct = (actual_cnt + 0.0) / actual_len
            predict_pct = (predict_cnt + 0.0) / predict_len
            psi_cut.append((actual_pct - predict_pct) * math.log(actual_pct / predict_pct))
            actual_bins.append(actual_pct)
            predict_bins.append(predict_pct)
    psi = sum(psi_cut)
    psi_dict['psi'] = psi
    return psi_dict
columns_select = list(X_columns)
fea_psi_compare = pd.DataFrame(index=['df_train'], columns=columns_select)
for column in columns_select:
    psi = fea_psi_calc(df_train[column].dropna(), df_test[column].dropna())['psi']
    fea_psi_compare.loc['df_train', column] = psi
    print(column, psi)
fea_psi_compare_T = fea_psi_compare.T
fea_psi_compare_T[fea_psi_compare_T['df_train'] > 1].sort_values(by='df_train', ascending=False).head()
# drop features whose PSI exceeds the common 0.25 stability threshold
psi_drop_columns = fea_psi_compare_T[fea_psi_compare_T['df_train'] > 0.25].sort_values(by='df_train', ascending=False).index.tolist()
print('psi_drop_columns:{}'.format(psi_drop_columns))
pandas groupby: name the output columns at the same time (named aggregation), as in the sketch below.
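A sketch using named aggregation (pandas >= 0.25), reusing the call_status_df example from above; the output column names are declared directly in agg():
age_answer_rat = call_status_df.groupby('age').agg(
    answer_count=('answer_statuss', 'count'),
    answer_mean=('answer_statuss', 'mean'),
).reset_index()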
pandas groupby: append a global total row:
def get_n_group_overdue(data, prob_col='prob', label_col='overdue', n_group=10, need_total=True):
    data = data[[prob_col, label_col]].copy()
    data.sort_values(by=prob_col, ascending=True, inplace=True)
    # learn equal-frequency cut points from the score column
    _, col_bins = pd.qcut(data[prob_col], q=n_group, duplicates='drop', retbins=True)
    data[prob_col] = pd.cut(data[prob_col], bins=col_bins)
    data_agg_df = data.groupby([prob_col]).agg({label_col: ['count', 'sum', 'mean']})
    data_agg_df.columns = ['_'.join(e) for e in data_agg_df.columns]
    data_agg_df.reset_index(inplace=True)
    # e.g. ruleset_111_mexico_loan_xgboost_score first1_overdue3_count first1_overdue3_sum first1_overdue3_mean
    if need_total:
        # append one row with the global count/sum/mean across all bins
        total_static_df = pd.DataFrame([['汇总', data[label_col].count(), data[label_col].sum(), data[label_col].mean()]],
                                       columns=data_agg_df.columns)
        total_static_df.index = [data_agg_df.shape[0]]
        data_agg_df = pd.concat([data_agg_df, total_static_df], axis=0)
    return data_agg_df
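Hypothetical usage, assuming a frame with score column 'prob' and binary label 'overdue':
agg_df = get_n_group_overdue(df, prob_col='prob', label_col='overdue', n_group=10)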
# Compute per-borrower weights:
def cal_weight_plain(x, max_overdue_days):
    # For an order qualified at a max-overdue threshold of n days, the weight is the share of the
    # order's installments whose overdue days reach n (the original note describes it as the share
    # of bill principal over the order amount; with equal installments that reduces to this count ratio).
    # e.g. for a first loan with N=6 installments and n=3 (max_overdue_days=3): if installments 2 and 3
    # are overdue by more than 3 days and the rest by less, the weight is 2/6 = 33.3%.
    # 1) ignores borrowers without a full N-installment history
    # 2) for borrowers without a full history: weight *= (max_behavior_num / max_term_num)
    # 3) the weight reflects overdue behaviour only, so borrowers who were never overdue get weight 0
    #    and need further processing downstream (see normalize_weight)
    return float((x['overdue_days'] >= max_overdue_days).sum()) / x.shape[0]  # (# installments with overdue_days >= n) / (all installments of the borrower)
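A sketch of applying it per order, assuming a hypothetical bills frame with columns 'order_id' and 'overdue_days':
weights = bills.groupby('order_id').apply(cal_weight_plain, max_overdue_days=3)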
def normalize_weight(s):
    # cal_weight_plain assigns weight 0 (e.g. weight_overdue3) to borrowers who were never overdue,
    # so the zero weights must be remapped before use as sample weights
    value_nums = s.value_counts()
    # value_nums.loc[0]: number of zero-weight borrowers, i.e. those never overdue
    zero_map_bal = (value_nums.index * value_nums).sum() / value_nums.loc[0]
    # value_nums[value_nums.index != 0]: borrowers with some overdue behaviour
    zero_map_imbal = (value_nums.index * value_nums).sum() / value_nums[value_nums.index != 0].sum()
    return s.where(s > 0, zero_map_bal), s.where(s > 0, zero_map_imbal)
def normalize_weight_pos0(s, weight_col, label, default_v=1/10):
    s_mirror = s.copy()
    # only makes a difference when label and weight_col use different overdue-day thresholds,
    # e.g. a borrower flagged overdue under the label's threshold but with zero weight under weight_col's threshold
    print(weight_col, label, s_mirror.loc[(s_mirror[label] == 1) & (s_mirror[weight_col] == 0), weight_col].head())
    # give labelled-bad borrowers with zero weight a small default weight before normalizing
    s_mirror.loc[(s_mirror[label] == 1) & (s_mirror[weight_col] == 0), weight_col] = default_v
    return normalize_weight(s_mirror[weight_col])
def cal_weight_scale(x):
    # note: max_overdue_days is taken from the enclosing scope here
    weight = float((x['overdue_days'] >= max_overdue_days).sum()) / x.shape[0]
    # scale down borrowers without a full 6-installment history (there was no 6-installment product in 2017)
    if x.shape[0] < 6:
        weight *= float(x.shape[0]) / 6
    return weight
def cal_weight_vintage(x):
    # same as cal_weight_plain, but max_overdue_days comes from the enclosing scope
    return float((x['overdue_days'] >= max_overdue_days).sum()) / x.shape[0]