import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
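# %matplotlib inline
# NOTE(review): "import xgboost as xgb" was fused onto the comment line above,
# so it never executed; restore it as a real import if xgb is used in this file.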
# Load the raw loan table. skiprows=1 skips the first line of the file, so the
# second line is read as the header row; low_memory=False has pandas process
# the file in one pass rather than in chunks when inferring column dtypes.
df = pd.read_csv('./data/LoanStats3a_2.csv', low_memory=False, skiprows=1)

# Interactive inspection: these bare expressions only display a result in a
# notebook/REPL and have no effect when the file is run as a script.
df.isnull().any()  # check for nulls
df.head()
df.shape

# Keep only the columns at positions 2..110 (drops many empty columns).
df = df.iloc[:, 2:111]

# Drop more columns: positions 45..71 of the already-trimmed frame.
empty_cols = list(range(45, 72))
df = df.drop(df.columns[empty_cols], axis=1)
df.shape

# Keep only rows whose loan_status is "Fully Paid" or "Charged Off".
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
df = df[df['loan_status'].isin(["Fully Paid", "Charged Off"])].copy()

# Encode the target as an integer label: Fully Paid -> 0, Charged Off -> 1.
df['loan_status'] = df['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})

# Drop every column that contains at least one missing value.
# NOTE(review): the previous comment here said "340000 is minimum number of
# non-NA values", but no thresh= argument is passed, so no such threshold is
# applied -- confirm whether dropna(axis=1, thresh=340000) was intended.
df = df.dropna(axis=1)
df