python风控建模实战lendingClub(博主录制,catboost,lightgbm建模,2K超清分辨率)
## 1. Data Lending Club 2016年Q3数据:https://www.lendingclub.com/info/download-data.action
参考:http://kldavenport.com/lending-club-data-analysis-revisted-with-python/
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline
df = pd.read_csv("./LoanStats_2016Q3.csv",skiprows=1,low_memory=False)
df.info()
df.head(3)
id
member_id
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
installment
grade
sub_grade
…
sec_app_earliest_cr_line
sec_app_inq_last_6mths
sec_app_mort_acc
sec_app_open_acc
sec_app_revol_util
sec_app_open_il_6m
sec_app_num_rev_accts
sec_app_chargeoff_within_12_mths
sec_app_collections_12_mths_ex_med
sec_app_mths_since_last_major_derog
0
NaN
NaN
15000.0
15000.0
15000.0
36 months
13.99%
512.60
C
C3
…
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
1
NaN
NaN
2600.0
2600.0
2600.0
36 months
8.99%
82.67
B
B1
…
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2
NaN
NaN
32200.0
32200.0
32200.0
60 months
21.49%
880.02
D
D5
…
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
3 rows × 122 columns
## 2. Keep what we need
# .ix[row slice, column slice]
df.ix[:4,:7]
id
member_id
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
0
NaN
NaN
15000.0
15000.0
15000.0
36 months
13.99%
1
NaN
NaN
2600.0
2600.0
2600.0
36 months
8.99%
2
NaN
NaN
32200.0
32200.0
32200.0
60 months
21.49%
3
NaN
NaN
10000.0
10000.0
10000.0
36 months
11.49%
4
NaN
NaN
6000.0
6000.0
6000.0
36 months
13.49%
df.drop('id',1,inplace=True)
df.drop('member_id',1,inplace=True)
df.int_rate = pd.Series(df.int_rate).str.replace('%', '').astype(float)
df.ix[:4,:7]
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
installment
grade
0
15000.0
15000.0
15000.0
36 months
13.99
512.60
C
1
2600.0
2600.0
2600.0
36 months
8.99
82.67
B
2
32200.0
32200.0
32200.0
60 months
21.49
880.02
D
3
10000.0
10000.0
10000.0
36 months
11.49
329.72
B
4
6000.0
6000.0
6000.0
36 months
13.49
203.59
C
### Loan Amount Requested Verus the Funded Amount
print (df.loan_amnt != df.funded_amnt).value_counts()
False 99120 True 4 dtype: int64
df.query('loan_amnt != funded_amnt').head(5)
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
installment
grade
sub_grade
emp_title
emp_length
…
sec_app_earliest_cr_line
sec_app_inq_last_6mths
sec_app_mort_acc
sec_app_open_acc
sec_app_revol_util
sec_app_open_il_6m
sec_app_num_rev_accts
sec_app_chargeoff_within_12_mths
sec_app_collections_12_mths_ex_med
sec_app_mths_since_last_major_derog
99120
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
…
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
99121
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
…
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
99122
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
…
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
99123
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
…
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
4 rows × 120 columns
df.dropna(axis=0, how='all',inplace=True)
df.info()
df.dropna(axis=1, how='all',inplace=True)
df.info()
df.ix[:5,8:15]
emp_title
emp_length
home_ownership
annual_inc
verification_status
issue_d
loan_status
0
Fiscal Director
2 years
RENT
55000.0
Not Verified
Sep-16
Current
1
Loaner Coordinator
3 years
RENT
35000.0
Source Verified
Sep-16
Fully Paid
2
warehouse/supervisor
10+ years
MORTGAGE
65000.0
Not Verified
Sep-16
Fully Paid
3
Teacher
10+ years
OWN
55900.0
Not Verified
Sep-16
Current
4
SERVICE MGR
5 years
RENT
33000.0
Not Verified
Sep-16
Current
5
General Manager
10+ years
MORTGAGE
109000.0
Source Verified
Sep-16
Current
### emp_title: employment title
print df.emp_title.value_counts().head()
print df.emp_title.value_counts().tail()
df.emp_title.unique().shape
Teacher 1931 Manager 1701 Owner 990 Supervisor 785 Driver 756 Name: emp_title, dtype: int64 Agent Services Representative 1 Operator Bridge Tunnel 1 Reg Medical Assistant/Referral Spec. 1 Home Health Care 1 rounds cook 1 Name: emp_title, dtype: int64 (37421,)
df.drop(['emp_title'],1, inplace=True)
df.ix[:5,8:15]
emp_length
home_ownership
annual_inc
verification_status
issue_d
loan_status
pymnt_plan
0
2 years
RENT
55000.0
Not Verified
Sep-16
Current
n
1
3 years
RENT
35000.0
Source Verified
Sep-16
Fully Paid
n
2
10+ years
MORTGAGE
65000.0
Not Verified
Sep-16
Fully Paid
n
3
10+ years
OWN
55900.0
Not Verified
Sep-16
Current
n
4
5 years
RENT
33000.0
Not Verified
Sep-16
Current
n
5
10+ years
MORTGAGE
109000.0
Source Verified
Sep-16
Current
n
### emp_length: employment length
df.emp_length.value_counts()
10+ years 34219 2 years 9066 3 years 7925
df.replace('n/a', np.nan,inplace=True)
df.emp_length.fillna(value=0,inplace=True)
df['emp_length'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True) df['emp_length'] = df['emp_length'].astype(int)
df.emp_length.value_counts()
10 34219 1 14095 2 9066 3 7925 5 6170 4 6022 0 5922 6 4406 8 4168 9 3922 7 3205 Name: emp_length, dtype: int64 ### verification status:”Indicates if income was verified by LC, not verified, or if the income source was verified”
df.verification_status.value_counts()
Source Verified 40781 Verified 31356 Not Verified 26983 Name: verification_status, dtype: int64 ### Target: Loan Statuses
df.info()
df.columns
Index([u’loan_amnt’, u’funded_amnt’, u’funded_amnt_inv’, u’term’, u’int_rate’, u’installment’, u’grade’, u’sub_grade’, u’emp_length’, u’home_ownership’, … u’num_tl_90g_dpd_24m’, u’num_tl_op_past_12m’, u’pct_tl_nvr_dlq’, u’percent_bc_gt_75’, u’pub_rec_bankruptcies’, u’tax_liens’, u’tot_hi_cred_lim’, u’total_bal_ex_mort’, u’total_bc_limit’, u’total_il_high_credit_limit’], dtype=’object’, length=107)
pd.unique(df['loan_status'].values.ravel())
array([‘Current’, ‘Fully Paid’, ‘Late (31-120 days)’, ‘Charged Off’, ‘Late (16-30 days)’, ‘In Grace Period’, ‘Default’], dtype=object)
for col in df.select_dtypes(include=['object']).columns:
print ("Column {} has {} unique instances".format( col, len(df[col].unique())) )
Column term has 2 unique instances Column grade has 7 unique instances Column sub_grade has 35 unique instances Column home_ownership has 4 unique instances Column verification_status has 3 unique instances Column issue_d has 3 unique instances Column loan_status has 7 unique instances Column pymnt_plan has 2 unique instances Column desc has 6 unique instances Column purpose has 13 unique instances Column title has 13 unique instances Column zip_code has 873 unique instances Column addr_state has 50 unique instances Column earliest_cr_line has 614 unique instances Column revol_util has 1087 unique instances Column initial_list_status has 2 unique instances Column last_pymnt_d has 13