import pandas as pd
# Load the training data from a CSV file (path is relative to the
# notebook's working directory) and preview its first ten rows.
df = pd.read_csv("../Desktop/train.csv")
df.head(n=10)
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status 0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y 1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N 2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y 3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y 4 LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y 5 LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.0 360.0 1.0 Urban Y 6 LP001013 Male Yes 0 Not Graduate No 2333 1516.0 95.0 360.0 1.0 Urban Y 7 LP001014 Male Yes 3+ Graduate No 3036 2504.0 158.0 360.0 0.0 Semiurban N 8 LP001018 Male Yes 2 Graduate No 4006 1526.0 168.0 360.0 1.0 Urban Y 9 LP001020 Male Yes 1 Graduate No 12841 10968.0 349.0 360.0 1.0 Semiurban N
# Select the rows whose Education is 'Not Graduate', Loan_Status is 'Y' and
# Gender is 'Female', keeping only the three columns used in the filter.
df1 = df.loc[
    df["Education"].eq("Not Graduate")
    & df["Loan_Status"].eq("Y")
    & df["Gender"].eq("Female"),
    ["Gender", "Education", "Loan_Status"],
]
df1.head()
Gender Education Loan_Status 50 Female Not Graduate Y 197 Female Not Graduate Y 205 Female Not Graduate Y 279 Female Not Graduate Y 403 Female Not Graduate Y
# Add a column 'new_col' holding a copy of the Loan_Status values,
# then display the whole frame.
df["new_col"] = df["Loan_Status"]
df
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status new_col 0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y Y 1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N N 2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y Y 3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y Y 4 LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y Y 5 LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.0 360.0 1.0 Urban Y Y 6 LP001013 Male Yes 0 Not Graduate No 2333 1516.0 95.0 360.0 1.0 Urban Y Y 7 LP001014 Male Yes 3+ Graduate No 3036 2504.0 158.0 360.0 0.0 Semiurban N N 8 LP001018 Male Yes 2 Graduate No 4006 1526.0 168.0 360.0 1.0 Urban Y Y 9 LP001020 Male Yes 1 Graduate No 12841 10968.0 349.0 360.0 1.0 Semiurban N N 10 LP001024 Male Yes 2 Graduate No 3200 700.0 70.0 360.0 1.0 Urban Y Y 11 LP001027 Male Yes 2 Graduate NaN 2500 1840.0 109.0 360.0 1.0 Urban Y Y 12 LP001028 Male Yes 2 Graduate No 3073 8106.0 200.0 360.0 1.0 Urban Y Y 13 LP001029 Male No 0 Graduate No 1853 2840.0 114.0 360.0 1.0 Rural N N 14 LP001030 Male Yes 2 Graduate No 1299 1086.0 17.0 120.0 1.0 Urban Y Y 15 LP001032 Male No 0 Graduate No 4950 0.0 125.0 360.0 1.0 Urban Y Y 16 LP001034 Male No 1 Not Graduate No 3596 0.0 100.0 240.0 NaN Urban Y Y 17 LP001036 Female No 0 Graduate No 3510 0.0 76.0 360.0 0.0 Urban N N 18 LP001038 Male Yes 0 Not Graduate No 4887 0.0 133.0 360.0 1.0 Rural N N 19 LP001041 Male Yes 0 Graduate NaN 2600 3500.0 115.0 NaN 1.0 Urban Y Y 20 LP001043 Male Yes 0 Not Graduate No 7660 0.0 104.0 360.0 0.0 Urban N N 21 LP001046 Male Yes 1 Graduate No 5955 5625.0 315.0 360.0 1.0 Urban Y Y 22 LP001047 Male Yes 0 Not Graduate No 2600 1911.0 116.0 360.0 0.0 Semiurban N N 23 LP001050 NaN Yes 2 Not Graduate No 3365 1917.0 112.0 360.0 0.0 Rural N N 24 LP001052 Male Yes 1 Graduate NaN 3717 2925.0 151.0 360.0 NaN Semiurban N N 25 
LP001066 Male Yes 0 Graduate Yes 9560 0.0 191.0 360.0 1.0 Semiurban Y Y 26 LP001068 Male Yes 0 Graduate No 2799 2253.0 122.0 360.0 1.0 Semiurban Y Y 27 LP001073 Male Yes 2 Not Graduate No 4226 1040.0 110.0 360.0 1.0 Urban Y Y 28 LP001086 Male No 0 Not Graduate No 1442 0.0 35.0 360.0 1.0 Urban N N 29 LP001087 Female No 2 Graduate NaN 3750 2083.0 120.0 360.0 1.0 Semiurban Y Y ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 584 LP002911 Male Yes 1 Graduate No 2787 1917.0 146.0 360.0 0.0 Rural N N 585 LP002912 Male Yes 1 Graduate No 4283 3000.0 172.0 84.0 1.0 Rural N N 586 LP002916 Male Yes 0 Graduate No 2297 1522.0 104.0 360.0 1.0 Urban Y Y 587 LP002917 Female No 0 Not Graduate No 2165 0.0 70.0 360.0 1.0 Semiurban Y Y 588 LP002925 NaN No 0 Graduate No 4750 0.0 94.0 360.0 1.0 Semiurban Y Y 589 LP002926 Male Yes 2 Graduate Yes 2726 0.0 106.0 360.0 0.0 Semiurban N N 590 LP002928 Male Yes 0 Graduate No 3000 3416.0 56.0 180.0 1.0 Semiurban Y Y 591 LP002931 Male Yes 2 Graduate Yes 6000 0.0 205.0 240.0 1.0 Semiurban N N 592 LP002933 NaN No 3+ Graduate Yes 9357 0.0 292.0 360.0 1.0 Semiurban Y Y 593 LP002936 Male Yes 0 Graduate No 3859 3300.0 142.0 180.0 1.0 Rural Y Y 594 LP002938 Male Yes 0 Graduate Yes 16120 0.0 260.0 360.0 1.0 Urban Y Y 595 LP002940 Male No 0 Not Graduate No 3833 0.0 110.0 360.0 1.0 Rural Y Y 596 LP002941 Male Yes 2 Not Graduate Yes 6383 1000.0 187.0 360.0 1.0 Rural N N 597 LP002943 Male No NaN Graduate No 2987 0.0 88.0 360.0 0.0 Semiurban N N 598 LP002945 Male Yes 0 Graduate Yes 9963 0.0 180.0 360.0 1.0 Rural Y Y 599 LP002948 Male Yes 2 Graduate No 5780 0.0 192.0 360.0 1.0 Urban Y Y 600 LP002949 Female No 3+ Graduate NaN 416 41667.0 350.0 180.0 NaN Urban N N 601 LP002950 Male Yes 0 Not Graduate NaN 2894 2792.0 155.0 360.0 1.0 Rural Y Y 602 LP002953 Male Yes 3+ Graduate No 5703 0.0 128.0 360.0 1.0 Urban Y Y 603 LP002958 Male No 0 Graduate No 3676 4301.0 172.0 360.0 1.0 Rural Y Y 604 LP002959 Female Yes 1 Graduate No 12000 0.0 496.0 360.0 1.0 
Semiurban Y Y 605 LP002960 Male Yes 0 Not Graduate No 2400 3800.0 NaN 180.0 1.0 Urban N N 606 LP002961 Male Yes 1 Graduate No 3400 2500.0 173.0 360.0 1.0 Semiurban Y Y 607 LP002964 Male Yes 2 Not Graduate No 3987 1411.0 157.0 360.0 1.0 Rural Y Y 608 LP002974 Male Yes 0 Graduate No 3232 1950.0 108.0 360.0 1.0 Rural Y Y 609 LP002978 Female No 0 Graduate No 2900 0.0 71.0 360.0 1.0 Rural Y Y 610 LP002979 Male Yes 3+ Graduate No 4106 0.0 40.0 180.0 1.0 Rural Y Y 611 LP002983 Male Yes 1 Graduate No 8072 240.0 253.0 360.0 1.0 Urban Y Y 612 LP002984 Male Yes 2 Graduate No 7583 0.0 187.0 360.0 1.0 Urban Y Y 613 LP002990 Female No 0 Graduate Yes 4583 0.0 133.0 360.0 0.0 Semiurban N N
614 rows × 14 columns
# Boolean Series: True for every row whose Education equals 'Not Graduate'.
df["Education"].eq("Not Graduate")
0 False
1 False
2 False
3 True
4 False
5 False
6 True
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
16 True
17 False
18 True
19 False
20 True
21 False
22 True
23 True
24 False
25 False
26 False
27 True
28 True
29 False
...
584 False
585 False
586 False
587 True
588 False
589 False
590 False
591 False
592 False
593 False
594 False
595 True
596 True
597 False
598 False
599 False
600 False
601 True
602 False
603 False
604 False
605 True
606 False
607 True
608 False
609 False
610 False
611 False
612 False
613 False
Name: Education, Length: 614, dtype: bool
def num_missing(x):
    """Return the number of missing (null/NaN) entries in ``x``.

    Intended for use with ``DataFrame.apply``: with ``axis=0`` it receives
    one column at a time, with ``axis=1`` one row at a time.

    Args:
        x: A pandas Series (or any object whose ``isnull()`` result
            supports ``.sum()`` returning a scalar).

    Returns:
        The count of null entries as a Python ``int``.
    """
    # Use the vectorised Series.sum() instead of the builtin sum(), which
    # walks the boolean mask one element at a time in Python. int() keeps
    # the return type a plain Python int, as the builtin sum() produced.
    return int(x.isnull().sum())
# Count the rows whose Gender value is missing.
int(df["Gender"].isnull().sum())
13
# Number of missing values in each column (one entry per column name).
df.isnull().sum(axis=0)
Loan_ID 0
Gender 13
Married 3
Dependents 15
Education 0
Self_Employed 32
ApplicantIncome 0
CoapplicantIncome 0
LoanAmount 22
Loan_Amount_Term 14
Credit_History 50
Property_Area 0
Loan_Status 0
new_col 0
dtype: int64
# Number of missing values in each row (one entry per row label).
df.isnull().sum(axis=1)
0 1
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 1
12 0
13 0
14 0
15 0
16 1
17 0
18 0
19 2
20 0
21 0
22 0
23 1
24 2
25 0
26 0
27 0
28 0
29 1
..
584 0
585 0
586 0
587 0
588 1
589 0
590 0
591 0
592 1
593 0
594 0
595 0
596 0
597 1
598 0
599 0
600 2
601 1
602 0
603 0
604 0
605 1
606 0
607 0
608 0
609 0
610 0
611 0
612 0
613 0
Length: 614, dtype: int64
# Keep the per-row missing-value counts in df2 and preview the first ten.
df2 = df.isnull().sum(axis=1)
df2.head(n=10)
0 1
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
dtype: int64
# Most frequent Gender value: mode() returns a Series of the modal
# value(s); take its first entry by position.
df["Gender"].mode().iat[0]
'Male'
# Impute missing Gender values with the most frequent category.
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on the df['Gender'] selection: that chained in-place
# call acts on an intermediate object and, with pandas Copy-on-Write, leaves
# df itself unchanged.
gender_mode = df["Gender"].mode().iloc[0]
df["Gender"] = df["Gender"].fillna(gender_mode)
# Sanity check: number of missing Gender values remaining after the fill.
int(df["Gender"].isnull().sum())
0
import numpy as np
# Mean LoanAmount for every (Gender, Married, Self_Employed) combination.
# The aggregation is named by the string "mean" rather than passing the
# np.mean callable: recent pandas versions deprecate NumPy callables here
# and recommend the string to keep the current behaviour.
df.pivot_table(
    values=["LoanAmount"],
    index=["Gender", "Married", "Self_Employed"],
    aggfunc="mean",
)
LoanAmount Gender Married Self_Employed Female No No 110.596774 Yes 125.800000 Yes No 135.480000 Yes 282.250000 Male No No 127.500000 Yes 180.588235 Yes No 153.982699 Yes 169.395833
# Fill each missing LoanAmount with the mean LoanAmount of the applicant's
# (Gender, Married, Self_Employed) group.
group_keys = ["Gender", "Married", "Self_Employed"]

# Lookup table of group means, indexed by the three grouping columns. It is
# built here so this cell does not depend on a name assigned elsewhere.
impute_grps = df.pivot_table(
    values=["LoanAmount"], index=group_keys, aggfunc="mean"
)

# Visit only the rows where LoanAmount is missing.
for i, row in df.loc[df["LoanAmount"].isnull(), group_keys].iterrows():
    ind = tuple(row)
    # A row with a missing grouping value has no entry in the table;
    # skip it (its LoanAmount stays missing) instead of raising KeyError.
    if ind in impute_grps.index:
        df.loc[i, "LoanAmount"] = impute_grps.loc[ind, "LoanAmount"]
File "<ipython-input-35-99e2725d767b>", line 3
df.loc[i,"LoanAmount"]=impute_grps.loc{ind}.values[0]
^
SyntaxError: invalid syntax
# Group the rows by Gender, Married and Self_Employed, then show the mean
# LoanAmount within each group.
grps = df.groupby(by=["Gender", "Married", "Self_Employed"])
grps["LoanAmount"].agg("mean")
Gender Married Self_Employed
Female No No 110.596774
Yes 125.800000
Yes No 135.480000
Yes 282.250000
Male No No 127.500000
Yes 180.588235
Yes No 153.982699
Yes 169.395833
Name: LoanAmount, dtype: float64