# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv('D:/my_project/Loan_Prediction/LoanPredictionProblem_train.csv')
df.head()
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
# 快速数据探索
Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | LP001002 | Male | No | 0 | Graduate | No | 5849 | 0.0 | NaN | 360.0 | 1.0 | Urban | Y |
1 | LP001003 | Male | Yes | 1 | Graduate | No | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | Rural | N |
2 | LP001005 | Male | Yes | 0 | Graduate | Yes | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | Urban | Y |
3 | LP001006 | Male | Yes | 0 | Not Graduate | No | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | Urban | Y |
4 | LP001008 | Male | No | 0 | Graduate | No | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | Urban | Y |
df.describe() # get summary of numerical variables
.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }
ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | |
---|---|---|---|---|---|
count | 614.000000 | 614.000000 | 592.000000 | 600.00000 | 564.000000 |
mean | 5403.459283 | 1621.245798 | 146.412162 | 342.00000 | 0.842199 |
std | 6109.041673 | 2926.248369 | 85.587325 | 65.12041 | 0.364878 |
min | 150.000000 | 0.000000 | 9.000000 | 12.00000 | 0.000000 |
25% | 2877.500000 | 0.000000 | 100.000000 | 360.00000 | 1.000000 |
50% | 3812.500000 | 1188.500000 | 128.000000 | 360.00000 | 1.000000 |
75% | 5795.000000 | 2297.250000 | 168.000000 | 360.00000 | 1.000000 |
max | 81000.000000 | 41667.000000 | 700.000000 | 480.00000 | 1.000000 |
# 对于非数值变量(e.g. Property_Area, Credit_History etc.), 观察频率分布是否合理。
df['Property_Area'].value_counts()
Semiurban 233 Urban 202 Rural 179 Name: Property_Area, dtype: int64 # 分布分析
# 研究变量的分布
<