import pandas as pd
import numpy as np
t = pd. read_csv( 'C:/Users/user/Desktop/train.csv' )
T = t. set_index( 'Loan_ID' )
T. head( 10 )
Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status Loan_ID LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.0 360.0 1.0 Urban Y LP001013 Male Yes 0 Not Graduate No 2333 1516.0 95.0 360.0 1.0 Urban Y LP001014 Male Yes 3+ Graduate No 3036 2504.0 158.0 360.0 0.0 Semiurban N LP001018 Male Yes 2 Graduate No 4006 1526.0 168.0 360.0 1.0 Urban Y LP001020 Male Yes 1 Graduate No 12841 10968.0 349.0 360.0 1.0 Semiurban N
mask= ( T[ 'Education' ] == 'Not Graduate' ) & ( T[ 'Loan_Status' ] == 'Y' ) & ( T[ 'Gender' ] == 'Female' )
T. loc[ mask, [ 'Gender' , 'Education' , 'Loan_Status' ] ]
Gender Education Loan_Status Loan_ID LP001155 Female Not Graduate Y LP001669 Female Not Graduate Y LP001692 Female Not Graduate Y LP001908 Female Not Graduate Y LP002300 Female Not Graduate Y LP002314 Female Not Graduate Y LP002407 Female Not Graduate Y LP002489 Female Not Graduate Y LP002502 Female Not Graduate Y LP002534 Female Not Graduate Y LP002582 Female Not Graduate Y LP002731 Female Not Graduate Y LP002757 Female Not Graduate Y LP002917 Female Not Graduate Y
def num_missing ( x) :
return sum ( x. isnull( ) )
T. apply ( num_missing, axis= 0 )
Gender 13
Married 3
Dependents 15
Education 0
Self_Employed 32
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 22
Loan_Amount_Term 14
Credit_History 50
Property_Area 0
Loan_Status 0
dtype: int64
T. apply ( num_missing, axis= 1 ) [ : 10 ]
Loan_ID
LP001002 1
LP001003 0
LP001005 0
LP001006 0
LP001008 0
LP001011 0
LP001013 0
LP001014 0
LP001018 0
LP001020 0
dtype: int64
T[ 'Gender' ] . fillna( T[ 'Gender' ] . mode( ) . iloc[ 0 ] , inplace= True )
T[ 'Married' ] . fillna( T[ 'Married' ] . mode( ) . iloc[ 0 ] , inplace= True )
T[ 'Self_Employed' ] . fillna( T[ 'Self_Employed' ] . mode( ) . iloc[ 0 ] , inplace= True )
pd. crosstab( T[ 'Credit_History' ] , T[ 'Loan_Status' ] , margins= True )
Loan_Status N Y All Credit_History 0.0 82 7 89 1.0 97 378 475 All 179 385 564
prop_rates = pd. DataFrame( [ 1000 , 5000 , 12000 ] , index= [ 'Rural' , 'Semiurban' , 'Urban' ] , columns= [ 'rates' ] )
prop_rates
rates Rural 1000 Semiurban 5000 Urban 12000
T1= T. merge( right= prop_rates, how= 'inner' , left_on= 'Property_Area' , right_index= True , sort= False )
T1
Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status rates Loan_ID LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y 12000 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y 12000 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y 12000 LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y 12000 LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.0 360.0 1.0 Urban Y 12000 LP001013 Male Yes 0 Not Graduate No 2333 1516.0 95.0 360.0 1.0 Urban Y 12000 LP001018 Male Yes 2 Graduate No 4006 1526.0 168.0 360.0 1.0 Urban Y 12000 LP001024 Male Yes 2 Graduate No 3200 700.0 70.0 360.0 1.0 Urban Y 12000 LP001027 Male Yes 2 Graduate No 2500 1840.0 109.0 360.0 1.0 Urban Y 12000 LP001028 Male Yes 2 Graduate No 3073 8106.0 200.0 360.0 1.0 Urban Y 12000 LP001030 Male Yes 2 Graduate No 1299 1086.0 17.0 120.0 1.0 Urban Y 12000 LP001032 Male No 0 Graduate No 4950 0.0 125.0 360.0 1.0 Urban Y 12000 LP001034 Male No 1 Not Graduate No 3596 0.0 100.0 240.0 NaN Urban Y 12000 LP001036 Female No 0 Graduate No 3510 0.0 76.0 360.0 0.0 Urban N 12000 LP001041 Male Yes 0 Graduate No 2600 3500.0 115.0 NaN 1.0 Urban Y 12000 LP001043 Male Yes 0 Not Graduate No 7660 0.0 104.0 360.0 0.0 Urban N 12000 LP001046 Male Yes 1 Graduate No 5955 5625.0 315.0 360.0 1.0 Urban Y 12000 LP001073 Male Yes 2 Not Graduate No 4226 1040.0 110.0 360.0 1.0 Urban Y 12000 LP001086 Male No 0 Not Graduate No 1442 0.0 35.0 360.0 1.0 Urban N 12000 LP001091 Male Yes 1 Graduate No 4166 3369.0 201.0 360.0 NaN Urban N 12000 LP001095 Male No 0 Graduate No 3167 0.0 74.0 360.0 1.0 Urban N 12000 LP001106 Male Yes 0 Graduate No 2275 2067.0 NaN 360.0 1.0 Urban Y 12000 LP001109 Male Yes 0 Graduate No 1828 1330.0 100.0 NaN 0.0 Urban N 12000 LP001114 Male No 0 Graduate No 4166 7210.0 184.0 360.0 1.0 Urban Y 12000 LP001119 Male No 0 Graduate No 3600 0.0 80.0 360.0 1.0 Urban N 12000 LP001120 Male No 0 Graduate No 1800 1213.0 47.0 360.0 1.0 Urban Y 12000 LP001123 Male Yes 0 Graduate No 2400 0.0 75.0 360.0 NaN Urban Y 12000 LP001136 Male Yes 0 Not Graduate Yes 4695 0.0 96.0 NaN 1.0 Urban Y 12000 LP001137 Female No 0 Graduate No 3410 0.0 88.0 NaN 1.0 Urban Y 12000 LP001138 Male Yes 1 Graduate No 5649 0.0 44.0 360.0 1.0 Urban Y 12000 ... ... ... ... ... ... ... ... ... ... ... ... ... ... LP002729 Male No 1 Graduate No 11250 0.0 196.0 360.0 NaN Semiurban N 5000 LP002738 Male No 2 Graduate No 3617 0.0 107.0 360.0 1.0 Semiurban Y 5000 LP002741 Female Yes 1 Graduate No 4608 2845.0 140.0 180.0 1.0 Semiurban Y 5000 LP002743 Female No 0 Graduate No 2138 0.0 99.0 360.0 0.0 Semiurban N 5000 LP002753 Female No 1 Graduate No 3652 0.0 95.0 360.0 1.0 Semiurban Y 5000 LP002757 Female Yes 0 Not Graduate No 3017 663.0 102.0 360.0 NaN Semiurban Y 5000 LP002768 Male No 0 Not Graduate No 3358 0.0 80.0 36.0 1.0 Semiurban N 5000 LP002776 Female No 0 Graduate No 5000 0.0 103.0 360.0 0.0 Semiurban N 5000 LP002792 Male Yes 1 Graduate No 5468 1032.0 26.0 360.0 1.0 Semiurban Y 5000 LP002795 Male Yes 3+ Graduate Yes 10139 0.0 260.0 360.0 1.0 Semiurban Y 5000 LP002798 Male Yes 0 Graduate No 3887 2669.0 162.0 360.0 1.0 Semiurban Y 5000 LP002804 Female Yes 0 Graduate No 4180 2306.0 182.0 360.0 1.0 Semiurban Y 5000 LP002807 Male Yes 2 Not Graduate No 3675 242.0 108.0 360.0 1.0 Semiurban Y 5000 LP002813 Female Yes 1 Graduate Yes 19484 0.0 600.0 360.0 1.0 Semiurban Y 5000 LP002821 Male No 0 Not Graduate Yes 5800 0.0 132.0 360.0 1.0 Semiurban Y 5000 LP002862 Male Yes 2 Not Graduate No 6125 1625.0 187.0 480.0 1.0 Semiurban N 5000 LP002863 Male Yes 3+ Graduate No 6406 0.0 150.0 360.0 1.0 Semiurban N 5000 LP002872 Male Yes 0 Graduate No 3087 2210.0 136.0 360.0 0.0 Semiurban N 5000 LP002892 Male Yes 2 Graduate No 6540 0.0 205.0 360.0 1.0 Semiurban Y 5000 LP002894 Female Yes 0 Graduate No 3166 0.0 36.0 360.0 1.0 Semiurban Y 5000 LP002917 Female No 0 Not Graduate No 2165 0.0 70.0 360.0 1.0 Semiurban Y 5000 LP002925 Male No 0 Graduate No 4750 0.0 94.0 360.0 1.0 Semiurban Y 5000 LP002926 Male Yes 2 Graduate Yes 2726 0.0 106.0 360.0 0.0 Semiurban N 5000 LP002928 Male Yes 0 Graduate No 3000 3416.0 56.0 180.0 1.0 Semiurban Y 5000 LP002931 Male Yes 2 Graduate Yes 6000 0.0 205.0 240.0 1.0 Semiurban N 5000 LP002933 Male No 3+ Graduate Yes 9357 0.0 292.0 360.0 1.0 Semiurban Y 5000 LP002943 Male No NaN Graduate No 2987 0.0 88.0 360.0 0.0 Semiurban N 5000 LP002959 Female Yes 1 Graduate No 12000 0.0 496.0 360.0 1.0 Semiurban Y 5000 LP002961 Male Yes 1 Graduate No 3400 2500.0 173.0 360.0 1.0 Semiurban Y 5000 LP002990 Female No 0 Graduate Yes 4583 0.0 133.0 360.0 0.0 Semiurban N 5000
614 rows × 13 columns
T1. pivot_table( values= "Credit_History" , index= [ "Property_Area" , "rates" ] , aggfunc= len )
Credit_History Property_Area rates Rural 1000 179.0 Semiurban 5000 233.0 Urban 12000 202.0
T1. groupby( [ 'Property_Area' , 'rates' ] ) [ 'Credit_History' ] . value_counts( )
Property_Area rates Credit_History
Rural 1000 1.0 137
0.0 28
Semiurban 5000 1.0 187
0.0 30
Urban 12000 1.0 151
0.0 31
Name: Credit_History, dtype: int64
paixu= T[ [ 'ApplicantIncome' , 'CoapplicantIncome' ] ]
paixu. sort_values( by= [ 'ApplicantIncome' , 'CoapplicantIncome' ] , inplace= True , ascending= False )
paixu. head( 10 )
C:\Users\user\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""Entry point for launching an IPython kernel.
ApplicantIncome CoapplicantIncome Loan_ID LP002317 81000 0.0 LP002101 63337 0.0 LP001585 51763 0.0 LP001536 39999 0.0 LP001640 39147 4750.0 LP002422 37719 0.0 LP001637 33846 0.0 LP001448 23803 0.0 LP002624 20833 6667.0 LP001922 20667 0.0
T2 = T. iloc[ : 5 , : 2 ]
T2
Gender Married Loan_ID LP001002 Male No LP001003 Male Yes LP001005 Male Yes LP001006 Male Yes LP001008 Male No
T3 = T. iloc[ : 8 , 2 : 4 ]
T3
Dependents Education Loan_ID LP001002 0 Graduate LP001003 1 Graduate LP001005 0 Graduate LP001006 0 Not Graduate LP001008 0 Graduate LP001011 2 Graduate LP001013 0 Not Graduate LP001014 3+ Graduate
T2. merge( T3, how= 'inner' , left_index= True , right_index= True )
Gender Married Dependents Education Loan_ID LP001002 Male No 0 Graduate LP001003 Male Yes 1 Graduate LP001005 Male Yes 0 Graduate LP001006 Male Yes 0 Not Graduate LP001008 Male No 0 Graduate
T3. merge( T2, how= 'inner' , left_index= True , right_index= True )
Dependents Education Gender Married Loan_ID LP001002 0 Graduate Male No LP001003 1 Graduate Male Yes LP001005 0 Graduate Male Yes LP001006 0 Not Graduate Male Yes LP001008 0 Graduate Male No
T2. merge( T3, how= 'right' , left_index= True , right_index= True )
Gender Married Dependents Education Loan_ID LP001002 Male No 0 Graduate LP001003 Male Yes 1 Graduate LP001005 Male Yes 0 Graduate LP001006 Male Yes 0 Not Graduate LP001008 Male No 0 Graduate LP001011 NaN NaN 2 Graduate LP001013 NaN NaN 0 Not Graduate LP001014 NaN NaN 3+ Graduate
T2. merge( T3, how= 'outer' , left_index= True , right_index= True )
Gender Married Dependents Education Loan_ID LP001002 Male No 0 Graduate LP001003 Male Yes 1 Graduate LP001005 Male Yes 0 Graduate LP001006 Male Yes 0 Not Graduate LP001008 Male No 0 Graduate LP001011 NaN NaN 2 Graduate LP001013 NaN NaN 0 Not Graduate LP001014 NaN NaN 3+ Graduate
pd. concat( [ T2, T3] )
Dependents Education Gender Married Loan_ID LP001002 NaN NaN Male No LP001003 NaN NaN Male Yes LP001005 NaN NaN Male Yes LP001006 NaN NaN Male Yes LP001008 NaN NaN Male No LP001002 0 Graduate NaN NaN LP001003 1 Graduate NaN NaN LP001005 0 Graduate NaN NaN LP001006 0 Not Graduate NaN NaN LP001008 0 Graduate NaN NaN LP001011 2 Graduate NaN NaN LP001013 0 Not Graduate NaN NaN LP001014 3+ Graduate NaN NaN
pd. concat( [ T2, T3] , axis= 0 )
Dependents Education Gender Married Loan_ID LP001002 NaN NaN Male No LP001003 NaN NaN Male Yes LP001005 NaN NaN Male Yes LP001006 NaN NaN Male Yes LP001008 NaN NaN Male No LP001002 0 Graduate NaN NaN LP001003 1 Graduate NaN NaN LP001005 0 Graduate NaN NaN LP001006 0 Not Graduate NaN NaN LP001008 0 Graduate NaN NaN LP001011 2 Graduate NaN NaN LP001013 0 Not Graduate NaN NaN LP001014 3+ Graduate NaN NaN
pd. concat( [ T2, T3] , axis= 1 )
Gender Married Dependents Education LP001002 Male No 0 Graduate LP001003 Male Yes 1 Graduate LP001005 Male Yes 0 Graduate LP001006 Male Yes 0 Not Graduate LP001008 Male No 0 Graduate LP001011 NaN NaN 2 Graduate LP001013 NaN NaN 0 Not Graduate LP001014 NaN NaN 3+ Graduate
pd. concat( [ T2, T3] , axis= 1 , join= 'inner' )
Gender Married Dependents Education Loan_ID LP001002 Male No 0 Graduate LP001003 Male Yes 1 Graduate LP001005 Male Yes 0 Graduate LP001006 Male Yes 0 Not Graduate LP001008 Male No 0 Graduate
paixu = T. sort_values( [ 'ApplicantIncome' , 'CoapplicantIncome' ] , ascending= False )
paixu[ [ 'ApplicantIncome' , 'CoapplicantIncome' ] ] . head( 10 )
ApplicantIncome CoapplicantIncome Loan_ID LP002317 81000 0.0 LP002101 63337 0.0 LP001585 51763 0.0 LP001536 39999 0.0 LP001640 39147 4750.0 LP002422 37719 0.0 LP001637 33846 0.0 LP001448 23803 0.0 LP002624 20833 6667.0 LP001922 20667 0.0
cut_points= [ 90 , 140 , 190 ]
break_points= [ T[ 'LoanAmount' ] . min ( ) ] + cut_points+ [ T[ 'LoanAmount' ] . max ( ) ]
print ( break_points)
labels= [ 'low' , 'medium' , 'high' , 'very_high' ]
T[ 'LoanAmount_Bin' ] = pd. cut( T[ 'LoanAmount' ] , bins= break_points, right= False , labels= [ 'low' , 'medium' , 'high' , 'very_high' ] )
[9.0, 90, 140, 190, 700.0]
pd. value_counts( T[ 'LoanAmount_Bin' ] , sort= False )
low 98
medium 266
high 136
very_high 91
Name: LoanAmount_Bin, dtype: int64
T[ 'Loan_Status_Coded' ] = T[ 'Loan_Status' ] . replace( { 'N' : 0 , 'Y' : 1 } )
pd. get_dummies( T[ 'LoanAmount_Bin' ] , prefix= 'LoanAmount' )
LoanAmount_low LoanAmount_medium LoanAmount_high LoanAmount_very_high Loan_ID LP001002 0 0 0 0 LP001003 0 1 0 0 LP001005 1 0 0 0 LP001006 0 1 0 0 LP001008 0 0 1 0 LP001011 0 0 0 1 LP001013 0 1 0 0 LP001014 0 0 1 0 LP001018 0 0 1 0 LP001020 0 0 0 1 LP001024 1 0 0 0 LP001027 0 1 0 0 LP001028 0 0 0 1 LP001029 0 1 0 0 LP001030 1 0 0 0 LP001032 0 1 0 0 LP001034 0 1 0 0 LP001036 1 0 0 0 LP001038 0 1 0 0 LP001041 0 1 0 0 LP001043 0 1 0 0 LP001046 0 0 0 1 LP001047 0 1 0 0 LP001050 0 1 0 0 LP001052 0 0 1 0 LP001066 0 0 0 1 LP001068 0 1 0 0 LP001073 0 1 0 0 LP001086 1 0 0 0 LP001087 0 1 0 0 ... ... ... ... ... LP002911 0 0 1 0 LP002912 0 0 1 0 LP002916 0 1 0 0 LP002917 1 0 0 0 LP002925 0 1 0 0 LP002926 0 1 0 0 LP002928 1 0 0 0 LP002931 0 0 0 1 LP002933 0 0 0 1 LP002936 0 0 1 0 LP002938 0 0 0 1 LP002940 0 1 0 0 LP002941 0 0 1 0 LP002943 1 0 0 0 LP002945 0 0 1 0 LP002948 0 0 0 1 LP002949 0 0 0 1 LP002950 0 0 1 0 LP002953 0 1 0 0 LP002958 0 0 1 0 LP002959 0 0 0 1 LP002960 0 0 0 0 LP002961 0 0 1 0 LP002964 0 0 1 0 LP002974 0 1 0 0 LP002978 1 0 0 0 LP002979 1 0 0 0 LP002983 0 0 0 1 LP002984 0 0 1 0 LP002990 0 1 0 0
614 rows × 4 columns