离散特征的处理
- 读取数据
- 找到所有离散特征
- 选择一个离散特征进行独热编码
- 采取循环对所有离散特征进行独热编码
- 结合昨天的内容,并处理所有缺失值
# Load the raw dataset from the Excel workbook.
data = pd.read_excel('data.xlsx')

# Collect the discrete (text-valued) columns.
# Both object and string dtypes are accepted: newer pandas versions may infer
# text columns as a dedicated string dtype, in which case a plain
# `dtype == 'object'` comparison would silently select nothing.
discrete_lists = [
    column
    for column in data.columns
    if pd.api.types.is_object_dtype(data[column])
    or pd.api.types.is_string_dtype(data[column])
]

# One-hot encode the discrete columns. drop_first=True removes the first level
# of each feature, yielding k-1 indicator columns for k levels.
# NOTE(review): get_dummies ignores NaN by default (dummy_na=False), so a row
# with a missing category ends up all-zero, exactly like the dropped first
# level. Confirm this is acceptable for the discrete features.
data = pd.get_dummies(data, columns=discrete_lists, drop_first=True)
data.columns
Index(['Id', 'Annual Income', 'Tax Liens', 'Number of Open Accounts',
'Years of Credit History', 'Maximum Open Credit',
'Number of Credit Problems', 'Months since last delinquent',
'Bankruptcies', 'Current Loan Amount', 'Current Credit Balance',
'Monthly Debt', 'Credit Score', 'Credit Default',
'Home Ownership_Home Mortgage', 'Home Ownership_Own Home',
'Home Ownership_Rent', 'Years in current job_10+ years',
'Years in current job_2 years', 'Years in current job_3 years',
'Years in current job_4 years', 'Years in current job_5 years',
'Years in current job_6 years', 'Years in current job_7 years',
'Years in current job_8 years', 'Years in current job_9 years',
'Years in current job_< 1 year', 'Purpose_buy a car',
'Purpose_buy house', 'Purpose_debt consolidation',
'Purpose_educational expenses', 'Purpose_home improvements',
'Purpose_major purchase', 'Purpose_medical bills', 'Purpose_moving',
'Purpose_other', 'Purpose_renewable energy', 'Purpose_small business',
'Purpose_take a trip', 'Purpose_vacation', 'Purpose_wedding',
'Term_Short Term'],
dtype='object')
# Find the names of all new features created by one-hot encoding: reload the
# untouched dataset and keep every encoded column that it does not contain.
data2 = pd.read_excel('data.xlsx')
list_final = [name for name in data.columns if name not in data2.columns]
list_final
['Home Ownership_Home Mortgage',
'Home Ownership_Own Home',
'Home Ownership_Rent',
'Years in current job_10+ years',
'Years in current job_2 years',
'Years in current job_3 years',
'Years in current job_4 years',
'Years in current job_5 years',
'Years in current job_6 years',
'Years in current job_7 years',
'Years in current job_8 years',
'Years in current job_9 years',
'Years in current job_< 1 year',
'Purpose_buy a car',
'Purpose_buy house',
'Purpose_debt consolidation',
'Purpose_educational expenses',
'Purpose_home improvements',
'Purpose_major purchase',
'Purpose_medical bills',
'Purpose_moving',
'Purpose_other',
'Purpose_renewable energy',
'Purpose_small business',
'Purpose_take a trip',
'Purpose_vacation',
'Purpose_wedding',
'Term_Short Term']
# Convert the boolean one-hot columns to integers in a single astype call,
# using a column -> dtype mapping instead of a per-column loop.
data = data.astype({name: int for name in list_final})
data.head()
Id Annual Income Tax Liens Number of Open Accounts Years of Credit History Maximum Open Credit Number of Credit Problems Months since last delinquent Bankruptcies Current Loan Amount ... Purpose_major purchase Purpose_medical bills Purpose_moving Purpose_other Purpose_renewable energy Purpose_small business Purpose_take a trip Purpose_vacation Purpose_wedding Term_Short Term
0 0 482087.0 0 11 26.3 685960 1 NaN 1.0 99999999 ... 0 0 0 0 0 0 0 0 0 1
1 1 1025487.0 0 15 15.3 1181730 0 NaN 0.0 264968 ... 0 0 0 0 0 0 0 0 0 0
2 2 751412.0 0 11 35.0 1182434 0 NaN 0.0 99999999 ... 0 0 0 0 0 0 0 0 0 1
3 3 805068.0 0 8 22.5 147400 1 NaN 1.0 121396 ... 0 0 0 0 0 0 0 0 0 1
4 4 776264.0 0 13 13.6 385836 1 NaN 0.0 125840 ... 0 0 0 0 0 0 0 0 0 1
5 rows × 42 columns
# Inspect missing values: count the NaN entries in every column.
data.isna().sum()
Id 0
Annual Income 1557
Tax Liens 0
Number of Open Accounts 0
Years of Credit History 0
Maximum Open Credit 0
Number of Credit Problems 0
Months since last delinquent 4081
Bankruptcies 14
Current Loan Amount 0
Current Credit Balance 0
Monthly Debt 0
Credit Score 1557
Credit Default 0
Home Ownership_Home Mortgage 0
Home Ownership_Own Home 0
Home Ownership_Rent 0
Years in current job_10+ years 0
Years in current job_2 years 0
Years in current job_3 years 0
Years in current job_4 years 0
Years in current job_5 years 0
Years in current job_6 years 0
Years in current job_7 years 0
Years in current job_8 years 0
Years in current job_9 years 0
Years in current job_< 1 year 0
Purpose_buy a car 0
Purpose_buy house 0
Purpose_debt consolidation 0
Purpose_educational expenses 0
Purpose_home improvements 0
Purpose_major purchase 0
Purpose_medical bills 0
Purpose_moving 0
Purpose_other 0
Purpose_renewable energy 0
Purpose_small business 0
Purpose_take a trip 0
Purpose_vacation 0
Purpose_wedding 0
Term_Short Term 0
dtype: int64
# Fill missing values with the column mean.
# The filled column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on `data[column]`: that chained in-place call acts
# on an intermediate object, which pandas warns about and which leaves `data`
# unchanged when Copy-on-Write is active.
for column in data.columns:
    if data[column].isnull().any():
        mean_value = data[column].mean()
        data[column] = data[column].fillna(mean_value)

# Re-check that no missing values remain.
data.isnull().sum()
Id 0
Annual Income 0
Tax Liens 0
Number of Open Accounts 0
Years of Credit History 0
Maximum Open Credit 0
Number of Credit Problems 0
Months since last delinquent 0
Bankruptcies 0
Current Loan Amount 0
Current Credit Balance 0
Monthly Debt 0
Credit Score 0
Credit Default 0
Home Ownership_Home Mortgage 0
Home Ownership_Own Home 0
Home Ownership_Rent 0
Years in current job_10+ years 0
Years in current job_2 years 0
Years in current job_3 years 0
Years in current job_4 years 0
Years in current job_5 years 0
Years in current job_6 years 0
Years in current job_7 years 0
Years in current job_8 years 0
Years in current job_9 years 0
Years in current job_< 1 year 0
Purpose_buy a car 0
Purpose_buy house 0
Purpose_debt consolidation 0
Purpose_educational expenses 0
Purpose_home improvements 0
Purpose_major purchase 0
Purpose_medical bills 0
Purpose_moving 0
Purpose_other 0
Purpose_renewable energy 0
Purpose_small business 0
Purpose_take a trip 0
Purpose_vacation 0
Purpose_wedding 0
Term_Short Term 0
dtype: int64