import pandas as pd
import numpy as np
t = pd. read_csv( 'C:/Users/user/Desktop/train.csv' )
T = t. set_index( 'Loan_ID' )
T. head( 10 )
Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status Loan_ID LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.0 360.0 1.0 Urban Y LP001013 Male Yes 0 Not Graduate No 2333 1516.0 95.0 360.0 1.0 Urban Y LP001014 Male Yes 3+ Graduate No 3036 2504.0 158.0 360.0 0.0 Semiurban N LP001018 Male Yes 2 Graduate No 4006 1526.0 168.0 360.0 1.0 Urban Y LP001020 Male Yes 1 Graduate No 12841 10968.0 349.0 360.0 1.0 Semiurban N
mask= ( T[ 'Education' ] == 'Not Graduate' ) & ( T[ 'Loan_Status' ] == 'Y' ) & ( T[ 'Gender' ] == 'Female' )
T. loc[ mask, [ 'Gender' , 'Education' , 'Loan_Status' ] ]
Gender Education Loan_Status Loan_ID LP001155 Female Not Graduate Y LP001669 Female Not Graduate Y LP001692 Female Not Graduate Y LP001908 Female Not Graduate Y LP002300 Female Not Graduate Y LP002314 Female Not Graduate Y LP002407 Female Not Graduate Y LP002489 Female Not Graduate Y LP002502 Female Not Graduate Y LP002534 Female Not Graduate Y LP002582 Female Not Graduate Y LP002731 Female Not Graduate Y LP002757 Female Not Graduate Y LP002917 Female Not Graduate Y
def num_missing ( x) :
return sum ( x. isnull( ) )
T. apply ( num_missing, axis= 0 )
Gender 13
Married 3
Dependents 15
Education 0
Self_Employed 32
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 22
Loan_Amount_Term 14
Credit_History 50
Property_Area 0
Loan_Status 0
dtype: int64
T. apply ( num_missing, axis= 1 ) [ : 10 ]
Loan_ID
LP001002 1
LP001003 0
LP001005 0
LP001006 0
LP001008 0
LP001011 0
LP001013 0
LP001014 0
LP001018 0
LP001020 0
dtype: int64
T[ 'Gender' ] . fillna( T[ 'Gender' ] . mode( ) . iloc[ 0 ] , inplace= True )
T[ 'Married' ] . fillna( T[ 'Married' ] . mode( ) . iloc[ 0 ] , inplace= True )
T[ 'Self_Employed' ] . fillna( T[ 'Self_Employed' ] . mode( ) . iloc[ 0 ] , inplace= True )
pd. crosstab( T[ 'Credit_History' ] , T[ 'Loan_Status' ] , margins= True )
Loan_Status N Y All Credit_History 0.0 82 7 89 1.0 97 378 475 All 179 385 564
prop_rates = pd. DataFrame( [ 1000 , 5000 , 12000 ] , index= [ 'Rural' , 'Semiurban' , 'Urban' ] , columns= [ 'rates' ] )
prop_rates
rates Rural 1000 Semiurban 5000 Urban 12000
T1= T. merge( right= prop_rates, how= 'inner' , left_on= 'Property_Area' , right_index= True , sort= False )
T1
Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status rates Loan_ID LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y 12000 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y 12000 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y 12000 LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y 12000 LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.0 360.0 1.0 Urban Y 12000 LP001013 Male Yes 0 Not Graduate No 2333 1516.0 95.0 360.0 1.0 Urban Y 12000 LP001018 Male Yes 2 Graduate No 4006 1526.0 168.0 360.0 1.0 Urban Y 12000 LP001024 Male Yes 2 Graduate No 3200 700.0 70.0 360.0 1.0 Urban Y 12000 LP001027 Male Yes 2 Graduate No 2500 1840.0 109.0 360.0 1.0 Urban Y 12000 LP001028 Male Yes 2 Graduate No 3073 8106.0 200.0 360.0 1.0 Urban Y 12000 LP001030 Male Yes 2 Graduate No 1299 1086.0 17.0 120.0 1.0 Urban Y 12000 LP001032 Male No 0 Graduate No 4950 0.0 125.0 360.0 1.0 Urban Y 12000 LP001034 Male No 1 Not Graduate No 3596 0.0 100.0 240.0 NaN Urban Y 12000 LP001036 Female No 0 Graduate No 3510 0.0 76.0 360.0 0.0 Urban N 12000 LP001041 Male Yes 0 Graduate No 2600 3500.0 115.0 NaN 1.0 Urban Y 12000 LP001043 Male Yes 0 Not Graduate No 7660 0.0 104.0 360.0 0.0 Urban N 12000 LP001046 Male Yes 1 Graduate No 5955 5625.0 315.0 360.0 1.0 Urban Y 12000 LP001073 Male Yes 2 Not Graduate No 4226 1040.0 110.0 360.0 1.0 Urban Y 12000 LP001086 Male No 0 Not Graduate No 1442 0.0 35.0 360.0 1.0 Urban N 12000 LP001091 Male Yes 1 Graduate No 4166 3369.0 201.0 360.0 NaN Urban N 12000 LP001095 Male No 0 Graduate No 3167 0.0 74.0 360.0 1.0 Urban N 12000 LP001106 Male Yes 0 Graduate No 2275 2067.0 NaN 360.0 1.0 Urban Y 12000 LP001109 Male Yes 0 Graduate No 1828 1330.0 100.0 NaN 0.0 Urban N 12000 LP001114 Male No 0 Graduate No 4166 7210.0 184.0 360.0 1.0 Urban Y 12000 LP001119 Male No 0 Graduate No 3600 0.0 80.0 360.0 1.0 Urban N 12000 LP001120 Male No 0 Graduate No 1800 1213.0 47.0 360.0 1.0 Urban Y 12000 LP001123 Male Yes 0 Graduate No 2400 0.0 75.0 360.0 NaN Urban Y 12000 LP001136 Male Yes 0 Not Graduate Yes 4695 0.0 96.0 NaN 1.0 Urban Y 12000 LP001137 Female No 0 Graduate No 3410 0.0 88.0 NaN 1.0 Urban Y 12000 LP001138 Male Yes 1 Graduate No 5649 0.0 44.0 360.0 1.0 Urban Y 12000 ... ... ... ... ... ... ... ... ... ... ... ... ... ... LP002729 Male No 1 Graduate No 11250 0.0 196.0 360.0 NaN Semiurban N 5000 LP002738 Male No 2 Graduate No 3617 0.0 107.0 360.0 1.0 Semiurban Y 5000 LP002741 Female Yes 1 Graduate No 4608 2845.0 140.0 180.0 1.0 Semiurban Y 5000 LP002743 Female No 0 Graduate No 2138 0.0 99.0 360.0 0.0 Semiurban N 5000 LP002753 Female No 1 Graduate No 3652 0.0 95.0 360.0 1.0 Semiurban Y 5000 LP002757 Female Yes 0 Not Graduate No 3017 663.0 102.0 360.0 NaN Semiurban Y 5000 LP002768 Male No 0 Not Graduate No 3358 0.0 80.0 36.0 1.0 Semiurban N 5000 LP002776 Female No 0 Graduate No 5000 0.0 103.0 360.0 0.0 Semiurban N 5000 LP002792 Male Yes 1 Graduate No 5468 1032.0 26.0 360.0 1.0 Semiurban Y 5000 LP002795 Male Yes 3+ Graduate Yes 10139 0.0 260.0 360.0 1.0 Semiurban Y 5000 LP002798 Male Yes 0 Graduate No 3887 2669.0 162.0 360.0 1.0 Semiurban Y 5000 LP002804 Female Yes 0 Graduate No 4180 2306.0 182.0 360.0 1.0 Semiurban Y 5000 LP002807 Male Yes 2 Not Graduate No 3675 242.0 108.0 360.0 1.0 Semiurban Y 5000 LP002813 Female Yes 1 Graduate Yes 19484 0.0 600.0 360.0 1.0 Semiurban Y 5000 LP002821 Male No 0 Not Graduate Yes 5800 0.0 132.0 360.0 1.0 Semiurban Y 5000 LP002862 Male Yes 2 Not Graduate No 6125 1625.0 187.0 480.0 1.0 Semiurban N 5000 LP002863 Male Yes 3+ Graduate No 6406 0.0 150.0 360.0 1.0 Semiurban N 5000 LP002872 Male Yes 0 Graduate No 3087 2210.0 136.0 360.0 0.0 Semiurban N 5000 LP002892 Male Yes 2 Graduate No 6540 0.0 205.0 360.0 1.0 Semiurban Y 5000 LP002894 Female Yes 0 Graduate No 3166 0.0 36.0 360.0 1.0 Semiurban Y 5000 LP002917 Female No 0 Not Graduate No 2165 0.0 70.0 360.0 1.0 Semiurban Y 5000 LP002925 Male No 0 Graduate No 4750 0.0 94.0 360.0 1.0 Semiurban Y 5000 LP002926 Male Yes 2 Graduate Yes 2726 0.0 106.0 360.0 0.0 Semiurban N 5000 LP002928 Male Yes 0 Graduate No 3000 3416.0 56.0 180.0 1.0 Semiurban Y 5000 LP002931 Male Yes 2 Graduate Yes 6000 0.0 205.0 240.0 1.0 Semiurban N 5000 LP002933 Male No 3+ Graduate Yes 9357 0.0 292.0 360.0 1.0 Semiurban Y 5000 LP002943 Male No NaN Graduate No 2987 0.0 88.0 360.0 0.0 Semiurban N 5000 LP002959 Female Yes 1 Graduate No 12000 0.0 496.0 360.0 1.0 Semiurban Y 5000 LP002961 Male Yes 1 Graduate No 3400 2500.0 173.0 360.0 1.0 Semiurban Y 5000 LP002990 Female No 0 Graduate Yes 4583 0.0 133.0 360.0 0.0 Semiurban N 5000
614 rows × 13 columns
impute_grps = T1. pivot_table( values= [ "Credit_History" ] , index= [ "Property_Area" , "rates" ] , aggfunc= np. mean)
ind = tuple ( [ row[ 'Property_Area' ] , row[ 'rates' ] ] )
T1. loc[ "Credit_History" ] = impute_grps. loc[ ind] . values[ 0 ]
grps = T1. groupby( [ "Property_Area" , "rates" ] )
grps[ "Credit_History" ] . mean( )
Property_Area rates
0.8617511520737328 0.861751 0.861751
Rural 1000.000000 0.830303
Semiurban 5000.000000 0.861751
Urban 12000.000000 0.829670
Name: Credit_History, dtype: float64
T2= T1. groupby( [ "Property_Area" , "rates" ] ) [ "Credit_History" ] . count( ) . reset_index( )
T2. set_index( "Property_Area" )
rates Credit_History Property_Area Rural 1000 165 Semiurban 5000 217 Urban 12000 182