练手项目:loan_prediction问题

机器学习实战 专栏收录该内容
1 篇文章 0 订阅
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Load the training set into a DataFrame. The path is an absolute local
# Windows path and has to be adjusted on any other machine.
df = pd.read_csv('D:/my_project/Loan_Prediction/LoanPredictionProblem_train.csv')
# Preview the first rows (rendered by the notebook as the cell result).
df.head()
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
Loan_IDGenderMarriedDependentsEducationSelf_EmployedApplicantIncomeCoapplicantIncomeLoanAmountLoan_Amount_TermCredit_HistoryProperty_AreaLoan_Status
0LP001002MaleNo0GraduateNo58490.0NaN360.01.0UrbanY
1LP001003MaleYes1GraduateNo45831508.0128.0360.01.0RuralN
2LP001005MaleYes0GraduateYes30000.066.0360.01.0UrbanY
3LP001006MaleYes0Not GraduateNo25832358.0120.0360.01.0UrbanY
4LP001008MaleNo0GraduateNo60000.0141.0360.01.0UrbanY
# Quick data exploration
df.describe() # get summary of numerical variables
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
ApplicantIncomeCoapplicantIncomeLoanAmountLoan_Amount_TermCredit_History
count614.000000614.000000592.000000600.00000564.000000
mean5403.4592831621.245798146.412162342.000000.842199
std6109.0416732926.24836985.58732565.120410.364878
min150.0000000.0000009.00000012.000000.000000
25%2877.5000000.000000100.000000360.000001.000000
50%3812.5000001188.500000128.000000360.000001.000000
75%5795.0000002297.250000168.000000360.000001.000000
max81000.00000041667.000000700.000000480.000001.000000
# For non-numerical variables (e.g. Property_Area, Credit_History etc.),
# check whether the frequency distribution looks reasonable.
df['Property_Area'].value_counts()
Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64
# 分布分析
# Study the distribution of individual variables
# 1. ApplicantIncome: histogram with 50 bins
df['ApplicantIncome'].hist(bins=50)
plt.show()
![png](output_5_0.png)
# Box plot of ApplicantIncome
df.boxplot(column='ApplicantIncome')
plt.show()
![png](output_6_0.png)
# Compare applicant income across education levels (one box per Education value)
df.boxplot(column='ApplicantIncome', by='Education')
plt.show()
D:\Anaconda2\lib\site-packages\numpy\core\fromnumeric.py:57: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(…) instead return getattr(obj, method)(*args, **kwds) ![png](output_7_1.png)
# 2. LoanAmount: histogram with 50 bins
df['LoanAmount'].hist(bins=50)
plt.show()
![png](output_8_0.png)
# Box plot of LoanAmount
df.boxplot(column='LoanAmount')
plt.show()
![png](output_9_0.png)
# 分类变量分析
# Frequency of each Credit_History value, smallest count first.
temp1 = df['Credit_History'].value_counts(ascending=True)
# Approval rate per Credit_History value: Loan_Status is mapped Y -> 1, N -> 0
# and averaged within each group.
temp2 = df.pivot_table(values='Loan_Status', index=['Credit_History'], aggfunc=lambda x: x.map({'Y': 1, 'N': 0}).mean())
# Single-argument print(...) produces the same output on Python 2 and runs on
# Python 3, where the bare `print x` statement is a syntax error.
print('Frequency Table for Credit History:')
print(temp1)
print('\nProbility of getting loan for each Credit History class:')
print(temp2)
Frequency Table for Credit History: 0.0 89 1.0 475 Name: Credit_History, dtype: int64 Probility of getting loan for each Credit History class: Loan_Status Credit_History 0.0 0.078652 1.0 0.795789
import matplotlib.pyplot as plt
# Two side-by-side bar charts: applicant counts (left) and approval rate (right)
# per Credit_History value.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# temp1 holds value counts, so the left panel is a frequency chart, not a
# probability chart; label it accordingly.
temp1.plot(kind='bar', ax=ax1)
ax1.set(title='Applicants by Credit_History', xlabel='Credit_History', ylabel='Count of Applicants')

# temp2 holds the mean of the 0/1-mapped Loan_Status, i.e. the approval rate.
temp2.plot(kind='bar', ax=ax2)
ax2.set(title='Probability of getting loan by credit history', xlabel='Credit_History', ylabel='Probability of getting loan')
plt.show()
![png](output_12_0.png)
# Cross-tabulate Credit_History (rows) against Loan_Status (columns) and draw
# the counts as a stacked bar chart.
temp3 = pd.crosstab(index=df['Credit_History'], columns=df['Loan_Status'])
temp3.plot(
    kind='bar',
    stacked=True,
    color=['red', 'blue'],
    grid=False,
)
# Check for missing values: number of nulls in each column
df.isnull().sum()
Loan_ID 0 Gender 13 Married 3 Dependents 15 Education 0 Self_Employed 32 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 22 Loan_Amount_Term 14 Credit_History 50 Property_Area 0 Loan_Status 0 dtype: int64
# fill missing values
from scipy.stats import mode
# Impute each listed column with its most frequent value.
# Series.mode() ignores NaN and works on string columns, whereas
# scipy.stats.mode emits a RuntimeWarning on object arrays that contain NaN.
# The filled column is assigned back explicitly, because calling
# fillna(..., inplace=True) on a column selection is not guaranteed to update
# the parent DataFrame.
for col in ['Gender', 'Married', 'Self_Employed', 'Credit_History']:
    df[col] = df[col].fillna(df[col].mode()[0])
# Re-check the remaining null counts per column.
df.isnull().sum()
D:\Anaconda2\lib\site-packages\scipy\stats\stats.py:253: RuntimeWarning: The input array could not be properly checked for nan values. nan values will be ignored. “values. nan values will be ignored.”, RuntimeWarning) Loan_ID 0 Gender 0 Married 0 Dependents 15 Education 0 Self_Employed 0 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 22 Loan_Amount_Term 14 Credit_History 0 Property_Area 0 Loan_Status 0 dtype: int64
# Build the lookup table used to fill LoanAmount: the mean LoanAmount for every
# (Gender, Married, Self_Employed) combination.
impute_grps = df.pivot_table(values=['LoanAmount'],index=['Gender','Married','Self_Employed'],aggfunc=np.mean)
# print(...) with one argument behaves the same on Python 2 and also runs on
# Python 3.
print(impute_grps)
LoanAmount Gender Married Self_Employed Female No No 114.691176 Yes 125.800000 Yes No 134.222222 Yes 282.250000 Male No No 129.936937 Yes 180.588235 Yes No 153.882736 Yes 169.395833
# Fill every missing LoanAmount with the mean of the row's
# (Gender, Married, Self_Employed) group, looked up in impute_grps.
rows_missing_amount = df[df['LoanAmount'].isnull()]
for i, row in rows_missing_amount.iterrows():
    ind = (row['Gender'], row['Married'], row['Self_Employed'])
    group_mean = impute_grps.loc[ind].values[0]
    df.loc[i, 'LoanAmount'] = group_mean
# Verify that the missing values were filled
df.isnull().sum()
Loan_ID 0 Gender 0 Married 0 Dependents 15 Education 0 Self_Employed 0 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 0 Loan_Amount_Term 14 Credit_History 0 Property_Area 0 Loan_Status 0 dtype: int64
# Take the natural log of LoanAmount (overwriting the column) and of the
# combined income.
df['LoanAmount'] = np.log(df['LoanAmount'])

# Total income is applicant income plus co-applicant income. The previous
# version added ApplicantIncome to itself, which merely doubled it and ignored
# the co-applicant entirely.
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['TotalIncome_log'] = np.log(df['TotalIncome'])
# 将非数值型变量转换为数值型
from sklearn.preprocessing import LabelEncoder
# Columns holding categorical values that are converted to integer codes.
mod_var = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']
# 'Dependents' may still contain NaN here. LabelEncoder would treat NaN as its
# own label(s) and produce meaningless codes, so fill the gaps with the most
# frequent value before encoding.
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
le = LabelEncoder()
for i in mod_var:
    # fit_transform re-fits the shared encoder on each column in turn.
    df[i] = le.fit_transform(df[i])
df.dtypes
# Single-argument print(...) works on both Python 2 and Python 3.
print(df.head())
Loan_ID Gender Married Dependents Education Self_Employed \ 0 LP001002 1 0 15 0 0 1 LP001003 1 1 16 0 0 2 LP001005 1 1 15 0 1 3 LP001006 1 1 15 1 0 4 LP001008 1 0 15 0 0 ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term \ 0 5849 0.0 4.867049 360.0 1 4583 1508.0 4.852030 360.0 2 3000 0.0 4.189655 360.0 3 2583 2358.0 4.787492 360.0 4 6000 0.0 4.948760 360.0 Credit_History Property_Area Loan_Status TotalIncome TotalIncome_log 0 1.0 2 1 11698 9.367173 1 1.0 0 0 9166 9.123256 2 1.0 2 1 6000 8.699515 3 1.0 2 1 5166 8.549854 4 1.0 2 1 12000 9.392662
# 构建预测模型
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Feature matrix (nine columns) and target vector.
X = df[['Gender','Married','Dependents','Education','Self_Employed','LoanAmount','Credit_History','Property_Area','TotalIncome_log']]
y = df['Loan_Status']
# A fixed random_state makes the split, and every score computed from it,
# reproducible from run to run. The default test fraction is kept.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=42)
train_X.shape,train_y.shape,test_X.shape, test_y.shape
((460, 9), (460,), (154, 9), (154,))
#Logistic Regression
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier with default settings, keep its
# predictions on the held-out rows, and report accuracy on both splits.
logis = LogisticRegression().fit(train_X, train_y)
expected = test_y
predicted = logis.predict(test_X)

logis_score_train = logis.score(train_X, train_y)
logis_score_test = logis.score(test_X, test_y)
print('Training score:', logis_score_train)
print('Testing score:', logis_score_test)
('Training score:', 0.80652173913043479)
('Testing score:', 0.81818181818181823)
# NOTE: this narrows df to the model's feature columns (Loan_Status and the raw
# income columns are dropped); kept so df has the same contents as before.
df = df[['Gender','Married','Dependents','Education','Self_Employed','LoanAmount','Credit_History','Property_Area','TotalIncome_log']]
# Pair every fitted coefficient with the feature it belongs to. logis was
# fitted on train_X, so coef_[0] follows train_X's column order. The previous
# version dropped the first column name ('Gender'), leaving 8 names for 9
# coefficients and shifting every coefficient onto the wrong feature.
coeff_df = pd.DataFrame({'Features': list(train_X.columns)})
# The column is named 'Correlation', but the values are logistic-regression
# coefficients, not correlation coefficients.
coeff_df['Correlation'] = logis.coef_[0]
coeff_df.sort_values(by='Correlation', ascending=False)
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
FeaturesCorrelation
6Property_Area3.079558
1Dependents0.553209
4LoanAmount0.016641
2Education-0.022159
7TotalIncome_log-0.025307
0Married-0.089583
3Self_Employed-0.345603
5Credit_History-0.462348
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree with default settings and report accuracy on the
# training and test splits.
dt = DecisionTreeClassifier().fit(train_X, train_y)

dt_score_train = dt.score(train_X, train_y)
dt_score_test = dt.score(test_X, test_y)
print('Training score: ', dt_score_train)
print('Testing score: ', dt_score_test)
('Training score: ', 1.0)
('Testing score: ', 0.69480519480519476)
#Random Forest
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest with default settings and report accuracy on the
# training and test splits.
rft = RandomForestClassifier().fit(train_X, train_y)

rft_score_train = rft.score(train_X, train_y)
rft_score_test = rft.score(test_X, test_y)
print('Training score: ', rft_score_train)
print('Testing score: ', rft_score_test)
('Training score: ', 0.98478260869565215)
('Testing score: ', 0.75324675324675328)
# Model comparison: one row per classifier with its training and test
# accuracy, listed from the lowest test score to the highest.
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest']
train_scores = [logis_score_train, dt_score_train, rft_score_train]
test_scores = [logis_score_test, dt_score_test, rft_score_test]
models = pd.DataFrame({
    'model': model_names,
    'training_score': train_scores,
    'testing_score': test_scores,
})
models.sort_values(by='testing_score', ascending=True)
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
modeltesting_scoretraining_score
1Decision Tree0.6948051.000000
2Random Forest0.7532470.984783
0Logistic Regression0.8181820.806522
  • 1
    点赞
  • 3
    评论
  • 0
    收藏
  • 一键三连
    一键三连
  • 扫一扫,分享海报

打赏
文章很值,打赏犒劳作者一下
相关推荐
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页

打赏

farmer_chou

你的鼓励将是我创作的最大动力

¥2 ¥4 ¥6 ¥10 ¥20
输入1-500的整数
余额支付 (余额:-- )
扫码支付
扫码支付:¥2
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值