读数据
# Load the training data; raw string keeps Windows backslashes literal.
df = pd.read_csv(r"E:\python\data\GiveMeSomeCredit\cs-training.csv")
df1 = df.drop_duplicates()  # drop duplicate rows
想要在Jupyter Notebook中获取当前所在位置的绝对路径。
%pwd或!pwd
排序
df.sort_values(by=[‘ColName’], ascending=False)
转换类型
import pandas as pd

# Convert the listed columns to float.
# NOTE: renamed from `list` so the builtin is not shadowed.
cols_to_float = ['count']
for col in cols_to_float:
    data[col] = data[col].astype('float')
# Cast dependent-count columns to integer dtype.
data_train['Numdepend'] = data_train['Numdepend'].astype('int64')
data_train['Withdepend'] = data_train['Withdepend'].astype('int64')
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
上面的最后一行指定了输出的格式是 svg,这样浏览器就能为你渲染矢量图了
假设字典:
# Demo: extracting a dict's keys.
# NOTE: renamed from `dict` so the builtin is not shadowed.
demo = {3: 9, 2: 9, 1: 8, 0: 4}
print(demo.keys())        # dict_keys([3, 2, 1, 0])
# list() turns the keys view into a plain list:
list(demo.keys())         # [3, 2, 1, 0]
# Equivalent manual loop (original iterated an undefined `newdict`):
newlist = []
for key in demo.keys():
    newlist.append(key)
group :
data.groupby(‘year2’)[‘amt’].count()
取唯一去重值:
numbers = [1, 2, 2, 3, 3, 4, 5]


def get_unique_numbers(numbers):
    """Return the distinct values of `numbers` as a list.

    Order follows set iteration order (not guaranteed to be sorted).
    """
    # A set already holds each value once; the manual append loop of the
    # original is just list(set(...)).
    return list(set(numbers))


print(get_unique_numbers(numbers))
# result: [1, 2, 3, 4, 5]
# Method 2: copy a set's members into a list one by one.
unique_numbers = set(numbers)
list_of_unique_numbers = []
for number in unique_numbers:
    list_of_unique_numbers.append(number)

# Method 3: one-liner via set conversion (does NOT preserve order) ...
unique_numbers = list(set(numbers))
# ... or de-duplicate while preserving first-seen order:
unique = []
for number in numbers:
    if number in unique:
        continue
    else:
        unique.append(number)
# Equivalent guard form: if number not in unique: unique.append(number)
#加载包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
描述性统计
# Descriptive statistics of the training data.
data_train.describe()
# Drop the index column carried in from the CSV.
data_train.drop('Unnamed: 0', axis=1, inplace=True)
data_train.head()
#列名重命名
# Shorten the verbose Kaggle column names.
colnames = {'SeriousDlqin2yrs': 'Isdlq',
            'RevolvingUtilizationOfUnsecuredLines': 'Revol',
            'NumberOfTime30-59DaysPastDueNotWorse': 'Num30-59late',
            'NumberOfOpenCreditLinesAndLoans': 'Numopen',
            'NumberOfTimes90DaysLate': 'Num90late',
            'NumberRealEstateLoansOrLines': 'Numestate',
            'NumberOfTime60-89DaysPastDueNotWorse': 'Num60-89late',
            'NumberOfDependents': 'Numdepend'}
data_train.rename(columns=colnames, inplace=True)
可视化统计 count
# Class balance of the target (1 = bad customer).
sns.countplot('Isdlq', data=data_train)
badNum = data_train.loc[data_train['Isdlq'] == 1, :].shape[0]
goodNum = data_train.loc[data_train['Isdlq'] == 0, :].shape[0]
print('好坏比:{0}%'.format(round(badNum * 100 / (goodNum + badNum), 2)))
# Age distribution: histogram and boxplot side by side.
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(20, 6))
sns.distplot(data_train['age'], ax=ax1)
sns.boxplot(y='age', data=data_train, ax=ax2)
分组看一下age各阶段违约率情况
# Default rate by age band: 1=(18,40) 2=[40,60) 3=[60,80) 4=[80,∞).
# NOTE(review): ages in (0, 18] fall through unbanded — confirm intended.
data_age = data_train.loc[data_train['age'] > 0, ['age', 'Isdlq']]
data_age.loc[(data_age['age'] > 18) & (data_age['age'] < 40), 'age'] = 1
data_age.loc[(data_age['age'] >= 40) & (data_age['age'] < 60), 'age'] = 2
data_age.loc[(data_age['age'] >= 60) & (data_age['age'] < 80), 'age'] = 3
data_age.loc[(data_age['age'] >= 80), 'age'] = 4
age_Isdlq = data_age.groupby('age')['Isdlq'].sum()
age_total = data_age.groupby('age')['Isdlq'].count()
age_Isratio = age_Isdlq / age_total
age_Isratio.plot(kind='bar', figsize=(8, 6), color='#4682B4')
可视化散点图
# Scatter of Revol (utilization) against age.
figure = plt.figure(figsize=(8, 6))
plt.scatter(data_train['Revol'], data_train['age'])
plt.grid()
按列条件取数
# age outliers: drop non-positive ages.
data_train = data_train[data_train['age'] > 0]
# Numdepend missing values: fill with 0.
# NOTE: original filled with the *string* '0', which silently turns the
# column into object dtype; numeric 0 keeps it numeric.
data_train['Numdepend'] = data_train['Numdepend'].fillna(0)
# MonthlyIncome missing values: impute with a random-forest prediction.
# Column 5 first — presumably MonthlyIncome, so it becomes the target; verify.
data_Forest = data_train.iloc[:, [5, 1, 2, 3, 4, 6, 7, 8, 9]]
MonthlyIncome_isnull = data_Forest.loc[data_train['MonthlyIncome'].isnull(), :]
MonthlyIncome_notnull = data_Forest.loc[data_train['MonthlyIncome'].notnull(), :]
from sklearn.ensemble import RandomForestRegressor

# Train on rows where MonthlyIncome is known; first column is the target.
X = MonthlyIncome_notnull.iloc[:, 1:].values
y = MonthlyIncome_notnull.iloc[:, 0].values
regr = RandomForestRegressor(max_depth=3, random_state=0, n_estimators=200, n_jobs=-1)
regr.fit(X, y)
MonthlyIncome_fillvalue = regr.predict(MonthlyIncome_isnull.iloc[:, 1:].values).round(0)
# Write the predictions back into the missing slots.
data_train.loc[data_train['MonthlyIncome'].isnull(), 'MonthlyIncome'] = MonthlyIncome_fillvalue
# Derived features.
data_train['AllNumlate'] = (data_train['Num30-59late']
                            + data_train['Num60-89late']
                            + data_train['Num90late'])
data_train['Monthlypayment'] = data_train['DebtRatio'] * data_train['MonthlyIncome']
data_train['Withdepend'] = data_train['Numdepend']
系数矩阵
# Correlation heatmap of all features.
corr = data_train.corr()
plt.figure(figsize=(14, 12))
sns.heatmap(corr, annot=True, linewidths=.3, cmap='YlGnBu')
WOE/IV 值
def bin_woe(tar, var, n=None, cat=None):
    """Bin a variable and compute its WOE/IV transform.

    Parameters
    ----------
    tar : pd.Series
        Binary target (1 = bad, 0 = good).
    var : pd.Series
        Explanatory variable to transform.
    n : int, optional
        Number of quantile bins (used only when cat == 's').
    cat : {'s', 'd'}
        's' = continuous variable (quantile-binned with qcut),
        'd' = discrete variable (grouped by its own values).

    Returns
    -------
    (woe_list, iv, cut, new_var)
        woe_list : WOE per group; iv : total information value;
        cut : bin edges ('s') or category index ('d');
        new_var : var re-expressed as its group WOE.
    """
    total_bad = tar.sum()
    total_good = tar.count() - total_bad
    # Overall good/bad odds; each group's odds are compared against this.
    totalRate = total_good / total_bad
    if cat == 's':
        # duplicates='drop' keeps qcut from failing on tied quantiles.
        msheet = pd.DataFrame({tar.name: tar, var.name: var,
                               'var_bins': pd.qcut(var, n, duplicates='drop')})
        grouped = msheet.groupby(['var_bins'])
    elif (cat == 'd') and (n is None):
        msheet = pd.DataFrame({tar.name: tar, var.name: var})
        grouped = msheet.groupby([var.name])
    groupBad = grouped.sum()[tar.name]
    groupTotal = grouped.count()[tar.name]
    groupGood = groupTotal - groupBad
    groupRate = groupGood / groupBad
    # WOE: log of group odds relative to overall odds.
    woe = np.log(groupRate / totalRate)
    iv = np.sum((groupGood / total_good - groupBad / total_bad) * woe)
    if cat == 's':
        new_var, cut = pd.qcut(var, n, duplicates='drop', retbins=True,
                               labels=woe.tolist())
    elif cat == 'd':
        dictmap = {}
        for x in woe.index:
            dictmap[x] = woe[x]
        new_var, cut = var.map(dictmap), woe.index
    return woe.tolist(), iv, cut, new_var
确定变量类型,连续变量还是离散变量
# Variable typing for WOE binning: dvar = discrete, svar = continuous.
dvar = ['Revol', 'DebtRatio', 'Num30-59late', 'Num60-89late', 'Num90late',
        'AllNumlate', 'Withdepend', 'Numestate', 'Numdepend']
svar = ['MonthlyIncome', 'age', 'Monthlypayment', 'Numopen']
可视化woe得分和iv得分
def woe_vs(data):
    """WOE/IV-transform every modelling variable and plot feature IVs.

    Relies on module-level `dvar` (discrete) / `svar` (continuous) lists
    and on `bin_woe`; expects the target column 'Isdlq' in `data`.

    Returns
    -------
    (ivdict, woe_var, woe_dict, cutdict)
        ivdict  : list of (feature, IV) pairs, ascending by IV
        woe_var : DataFrame of WOE-encoded variables
        woe_dict: feature -> list of per-group WOE values
        cutdict : feature -> bin edges / category index
    """
    cutdict = {}
    ivdict = {}
    woe_dict = {}
    woe_var = pd.DataFrame()
    for var in data.columns:
        if var in dvar:
            woe, iv, cut, new = bin_woe(data['Isdlq'], data[var], cat='d')
            woe_dict[var] = woe
            woe_var[var] = new
            ivdict[var] = iv
            cutdict[var] = cut
        elif var in svar:
            woe, iv, cut, new = bin_woe(data['Isdlq'], data[var], n=5, cat='s')
            woe_dict[var] = woe
            woe_var[var] = new
            ivdict[var] = iv
            cutdict[var] = cut
    # Sort features by IV (ascending) for the horizontal bar chart.
    ivdict = sorted(ivdict.items(), key=lambda x: x[1], reverse=False)
    iv_vs = pd.DataFrame([x[1] for x in ivdict],
                         index=[x[0] for x in ivdict], columns=['IV'])
    ax = iv_vs.plot(kind='barh',
                    figsize=(12, 12),
                    title='Feature IV',
                    fontsize=10,
                    width=0.8,
                    color='#00688B')
    ax.set_ylabel('Features')
    ax.set_xlabel('IV of Features')
    return ivdict, woe_var, woe_dict, cutdict
woe转化
# WOE-transform all features, then split on the strongest-IV ones.
ivinfo, woe_data, woe_dict, cut_dict = woe_vs(data_train)
from sklearn.model_selection import train_test_split

IV_info = ['Num60-89late', 'Num90late', 'AllNumlate', 'Revol', 'age']
X = woe_data[IV_info]
y = data_train['Isdlq']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Fit a class-weighted L2 logistic regression.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=0,
                           solver="sag",
                           penalty="l2",
                           class_weight="balanced",
                           C=1.0,
                           max_iter=500)
model.fit(X_train, y_train)
# predict_proba returns [P(label=0), P(label=1)] per row; keep P(bad).
model_proba = model.predict_proba(X_test)
model_score = model_proba[:, 1]
# ROC curve and AUC on the test set.
from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr, thresholds = roc_curve(y_test, model_score)
auc_score = roc_auc_score(y_test, model_score)
plt.plot(fpr, tpr, linewidth=2, label='AUC = %0.2f' % auc_score)
plt.plot([0, 1], [0, 1], "k--")  # fixed: 'k–' (en-dash) → 'k--' dashed diagonal
plt.axis([0, 1, 0, 1])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
import math

intercept = model.intercept_
coef = model.coef_
coe = coef[0].tolist()
coe_df = pd.DataFrame({'feature': IV_info, 'coe': coe})
# Score scaling: 20 points double the odds (PDO=20), score 600 at odds 1:20.
B = 20 / math.log(2)
A = 600 + B * math.log(1 / 20)      # fixed: original had 'Bmath.log' (missing *)
# Base score from the model intercept.
score = round(A - B * intercept[0], 0)  # fixed: original had 'Bintercept' (missing *)
# Build the scorecard: one row per (feature, bin) with its WOE and points.
featurelist = []
woelist = []
cutlist = []
for k, v in woe_dict.items():
    if k in IV_info:
        for idx in range(len(v)):
            featurelist.append(k)
            woelist.append(v[idx])
            cutlist.append(cut_dict[k][idx])
scoreboard = pd.DataFrame({'feature': featurelist, 'woe': woelist, 'cut': cutlist},
                          columns=['feature', 'cut', 'woe'])
score_df = pd.merge(scoreboard, coe_df)
# Points per bin: -B * WOE * coefficient.
score_df['score'] = round(-B * score_df['woe'] * score_df['coe'], 0)
score_df.drop('coe', axis=1, inplace=True)
score_df
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# fixed: sklearn.cross_validation was removed; train_test_split lives in model_selection
from sklearn.model_selection import train_test_split

# Load the advertising dataset.
adv_data = pd.read_csv("C:/Users/Administrator/Desktop/Advertising.csv")
# Drop the leading index column.
# fixed: DataFrame.ix was removed from pandas; use positional .iloc
new_adv_data = adv_data.iloc[:, 1:]
# Peek at the cleaned data and its shape.
print('head:', new_adv_data.head(), '\nShape:', new_adv_data.shape)
# Summary statistics.
print(new_adv_data.describe())
# Missing-value check.
print(new_adv_data[new_adv_data.isnull() == True].count())
new_adv_data.boxplot()
plt.savefig("boxplot.jpg")
plt.show()
# Correlation matrix: r = cov(x, y) / (σx * σy).
# |r|: 0–0.3 weak, 0.3–0.6 moderate, 0.6–1 strong.
print(new_adv_data.corr())
# Scatter of each feature vs. sales; kind='reg' adds the fitted line
# with a 95% confidence band.
# fixed: seaborn renamed pairplot's `size` parameter to `height`
sns.pairplot(new_adv_data, x_vars=['TV', 'radio', 'newspaper'], y_vars='sales',
             height=7, aspect=0.8, kind='reg')
plt.savefig("pairplot.jpg")
plt.show()
# Train/test split; train_size is the training fraction.
# fixed: .ix removed from pandas → .iloc
X_train, X_test, Y_train, Y_test = train_test_split(
    new_adv_data.iloc[:, :3], new_adv_data.sales, train_size=.80)
print("原始数据特征:", new_adv_data.iloc[:, :3].shape,
      ",训练数据特征:", X_train.shape,
      ",测试数据特征:", X_test.shape)
print("原始数据标签:", new_adv_data.sales.shape,
      ",训练数据标签:", Y_train.shape,
      ",测试数据标签:", Y_test.shape)
model = LinearRegression()
model.fit(X_train, Y_train)
a = model.intercept_  # intercept
b = model.coef_       # regression coefficients
print("最佳拟合线:截距", a, ",回归系数:", b)
# e.g. y = 2.668 + 0.0448*TV + 0.187*Radio - 0.00242*Newspaper
#y=2.668+0.0448∗TV+0.187∗Radio-0.00242∗Newspaper
#R方检测
#决定系数r平方
#对于评估模型的精确度
#y误差平方和 = Σ(y实际值 - y预测值)^2
#y的总波动 = Σ(y实际值 - y平均值)^2
#有多少百分比的y波动没有被回归拟合线所描述 = SSE/总波动
#有多少百分比的y波动被回归线描述 = 1 - SSE/总波动 = 决定系数R平方
#对于决定系数R平方来说1) 回归线拟合程度:有多少百分比的y波动刻印有回归线来描述(x的波动变化)
#2)值大小:R平方越高,回归模型越精确(取值范围0~1),1无误差,0无法完成拟合
# R² of the fitted model on the held-out test set.
score = model.score(X_test,Y_test)
print(score)
# Predict with the fitted linear model.
Y_pred = model.predict(X_test)
print(Y_pred)
plt.plot(range(len(Y_pred)),Y_pred,'b',label="predict")
# Show the figure.
# plt.savefig("predict.jpg")
plt.show()
# Predicted vs. actual sales on the same axes.
plt.figure()
plt.plot(range(len(Y_pred)),Y_pred,'b',label="predict")
plt.plot(range(len(Y_pred)),Y_test,'r',label="test")
plt.legend(loc="upper right") # show the legend
plt.xlabel("the number of sales")
plt.ylabel('value of sales')
plt.savefig("ROC.jpg")
plt.show()
# Rename columns to Chinese for readability.
states={
'Unnamed: 0':'id',
'SeriousDlqin2yrs':'好坏客户',
'RevolvingUtilizationOfUnsecuredLines':'可用额度比值', # revolving utilization of unsecured lines
'age':'年龄',
'NumberOfTime30-59DaysPastDueNotWorse':'逾期30-59天笔数',
'DebtRatio':'负债率',
'MonthlyIncome':'月收入',
'NumberOfOpenCreditLinesAndLoans':'信贷数量',
'NumberOfTimes90DaysLate':'逾期90天笔数',
'NumberRealEstateLoansOrLines':'固定资产贷款量',
'NumberOfTime60-89DaysPastDueNotWorse':'逾期60-89天笔数',
'NumberOfDependents':'家属数量'
} # mapping dict: original name -> Chinese name
train.rename(columns=states,inplace=True)
train.head()
# Missing-value ratios for the two incomplete columns.
print("月收入缺失比:{:.2%}".format(train['月收入'].isnull().sum() / train.shape[0]))
print("家属数量缺失比:{:.2%}".format(train['家属数量'].isnull().sum() / train.shape[0]))
# Fill monthly income with the column mean.
train['月收入'] = train['月收入'].fillna(train['月收入'].mean())
# Drop the remaining rows with missing values.
train = train.dropna()
train.info()
# Remove outliers per variable.
train = train[train['逾期30-59天笔数'] < 80]
train = train[train['逾期60-89天笔数'] < 80]  # fixed: this filter was duplicated in the original
train = train[train['逾期90天笔数'] < 80]
train = train[train['固定资产贷款量'] < 50]
train = train[train['家属数量'] < 15]
# Drop duplicated rows.
train.drop_duplicates(inplace=True)
train.describe()
# Configure matplotlib to render Chinese labels and the minus sign.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
train.hist(figsize=(20, 15))
#对于不能采用最优分段的变量采用等深分段
def funqcut(y, x, n):
    """Equal-depth (equal-frequency) binning with per-bin WOE/IV.

    y : binary target series (1 = bad customer)
    x : variable to bin
    n : number of bins
    Prints and returns a DataFrame with per-bin counts, WOE and IV.
    """
    # rank(method='first') breaks ties so qcut's quantile edges stay unique.
    cut1 = pd.qcut(x.rank(method='first'), n)
    data = pd.DataFrame({"x": x, "y": y, "cut1": cut1})
    cutbad = data.groupby(cut1).y.sum()               # bad customers per bin
    cutgood = data.groupby(cut1).y.count() - cutbad   # good customers per bin
    bad = data.y.sum()            # total bad customers
    good = data.y.count() - bad   # total good customers
    woe = np.log((cutbad / bad) / (cutgood / good))   # per-bin WOE
    iv = (cutbad / bad - cutgood / good) * woe        # per-bin IV contribution
    cut = pd.DataFrame({"坏客户数": cutbad, "好客户数": cutgood, "woe": woe, "iv": iv})
    print(cut)
    return cut
# Equal-depth binning + WOE/IV for each candidate variable.
#funqcut(train['好坏客户'],train['年龄'],6).reset_index()
x1=funqcut(train['好坏客户'],train['可用额度比值'],5).reset_index()
x2=funqcut(train['好坏客户'],train['年龄'],12).reset_index()
x4=funqcut(train['好坏客户'],train['负债率'],4).reset_index()
x5=funqcut(train['好坏客户'],train['月收入'],5).reset_index()
x6=funqcut(train['好坏客户'],train['信贷数量'],6).reset_index()
'''qcut按照等频方式分箱,且要求分位点处的取值唯一。当有多个元素有相同的分位点处取值时,就会报错
添加.rank(method=‘first’),相同取值元素的rank不同pd.qcut(df['a'].rank(method='first'), 10) '''
x3=funqcut(train['好坏客户'],train['逾期30-59天笔数'],5).reset_index()
x7=funqcut(train['好坏客户'],train['逾期90天笔数'],5).reset_index()
x8=funqcut(train['好坏客户'],train['固定资产贷款量'],5).reset_index()
x9=funqcut(train['好坏客户'],train['逾期60-89天笔数'],5).reset_index()
x10=funqcut(train['好坏客户'],train['家属数量'],5).reset_index()
# Compute ROC and AUC with sklearn.metrics.
from sklearn.metrics import roc_curve, auc # metric helpers
proba_y=model.predict_proba(test_x) # predicted class probabilities
'''返回的是一个n行k列的数组,第i行第j列上的数值是模型预测第i个预测样本的标签为j的概率,此时每一行的和应该等于1。'''
fpr,tpr,threshold=roc_curve(test_y,proba_y[:,1]) # thresholds, TPR, FPR; scores above the threshold are classed 1 (bad)
roc_auc=auc(fpr,tpr) # area under the ROC curve
plt.plot(fpr,tpr,'b',label= 'AUC= %0.2f' % roc_auc) # ROC curve
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('真正率')
plt.xlabel('假正率')
plt.show()
print(roc_auc)
# KS reference snippet, kept inert inside a string (`dataks` is not defined
# in this file). KS statistic = max(tpr - fpr).
# fixed: the original opening ''' was never closed, which broke parsing of
# everything after this point.
'''KS
plt.plot(dataks.iloc[:,2],dataks['fpr'],label='fpr')
plt.plot(dataks.iloc[:,2],dataks['tpr'],label='tpr')
plt.xlim([0,1])
plt.legend(loc='upper left')
plt.show()
ks=max(tpr-fpr)
print("ks值为:",ks)
'''
# Correlation heatmap of all variables (excluding the id column).
corr = df.iloc[:,1:].corr() # pairwise correlation coefficients
xticks = list(corr.index) # x-axis labels
yticks = list(corr.index) # y-axis labels
fig = plt.figure(figsize=(10,10))
ax1 = fig.add_subplot(1, 1, 1)
sns.heatmap(corr,annot=True,cmap="rainbow",ax=ax1,linewidths=.5,annot_kws={'size':9,'weight':'bold', 'color':'k'})
ax1.set_xticklabels(xticks, rotation=30, fontsize=10)
ax1.set_yticklabels(yticks, rotation=0, fontsize=10)
plt.show()
# Overall bad/good odds of the portfolio.
rate = df['好坏客户'].sum() / (df['好坏客户'].count() - df['好坏客户'].sum())


def get_woe_data(cut):
    """Compute per-bin WOE for one binned variable.

    Depends on module globals `df` (with target column '好坏客户') and `rate`.
    unstack() puts the 0/1 target counts into columns — presumably col 1 is
    the bad count and col 0 the good count; verify against the data.
    """
    grouped = df['好坏客户'].groupby(cut, as_index=True).value_counts()
    woe = np.log(pd.DataFrame(grouped).unstack().iloc[:, 1]
                 / pd.DataFrame(grouped).unstack().iloc[:, 0] / rate)
    return woe


cut1_woe = get_woe_data(cut1)
def get_IV_data(cut, cut_woe):
    """Compute the information value (IV) of one binned variable.

    Depends on module global `df`; `cut_woe` is the output of get_woe_data.
    IV = sum over bins of (bad_share - good_share) * WOE.
    """
    grouped = df['好坏客户'].groupby(cut, as_index=True).value_counts()
    cut_IV = ((pd.DataFrame(grouped).unstack().iloc[:, 1] / df['好坏客户'].sum()
               - pd.DataFrame(grouped).unstack().iloc[:, 0]
               / (df['好坏客户'].count() - df['好坏客户'].sum())) * cut_woe).sum()
    return cut_IV


# IV of the first binned variable.
cut1_IV = get_IV_data(cut1, cut1_woe)
# Visualise per-feature IV (cut2_IV … cut10_IV are computed analogously
# to cut1_IV for the other binned variables).
df_IV = pd.DataFrame([cut1_IV, cut2_IV, cut3_IV, cut4_IV, cut5_IV,
                      cut6_IV, cut7_IV, cut8_IV, cut9_IV, cut10_IV],
                     index=df.columns[2:])
df_IV.plot(kind='bar')
# Annotate each bar with its IV value.
for a, b in zip(range(10), df_IV.values):
    plt.text(a, b, '%.2f' % b, ha='center', va='bottom', fontsize=9)
df_new=df.drop(['负债率', '月收入', '信贷数量','固定资产贷款量', '家属数量','用户ID'],axis=1)
def replace_data(cut, cut_woe):
    """Replace each bin value in `cut` with its WOE (in place) and return it.

    Bins are sorted ascending and matched positionally against `cut_woe`,
    so cut_woe must be ordered the same way.
    """
    bins = sorted(cut.unique())
    for pos, bin_value in enumerate(bins):
        cut.replace(bin_value, cut_woe.values[pos], inplace=True)
    return cut
# Substitute each selected feature's bins with their WOE values.
df_new['可用额度比值']=replace_data(cut1,cut1_woe)
df_new['年龄']=replace_data(cut2,cut2_woe)
df_new['逾期30-59天笔数']=replace_data(cut3,cut3_woe)
df_new['逾期90天笔数']=replace_data(cut7,cut7_woe)
df_new['逾期60-89天笔数']=replace_data(cut9,cut9_woe)
coe=model.coef_ # fitted logistic-regression coefficients
# Industry convention: odds of 50 maps to a score of 600,
# and each doubling of the odds adds 20 points.
factor = 20/np.log(2)
offset = 600 - 20 * np.log(20)/np.log(2)
def get_score(coe, woe, factor):
    """Score points for one feature: round(coe * woe * factor) per bin.

    coe    : the feature's logistic-regression coefficient
    woe    : iterable of per-bin WOE values
    factor : points-to-double-odds scaling factor
    """
    return [round(coe * w * factor, 0) for w in woe]
# Per-bin score points for each selected feature.
x1 = get_score(coe[0][0], cut1_woe, factor)
x2 = get_score(coe[0][1], cut2_woe, factor)
x3 = get_score(coe[0][2], cut3_woe, factor)
x7 = get_score(coe[0][3], cut7_woe, factor)
x9 = get_score(coe[0][4], cut9_woe, factor)
# Print the score table of every feature.
print("可用额度比值对应的分数:{}".format(x1))
print("年龄对应的分数:{}".format(x2))
print("逾期30-59天笔数对应的分数:{}".format(x3))
print("逾期90天笔数对应的分数:{}".format(x7))
print("逾期60-89天笔数对应的分数:{}".format(x9))