import pandas as pd
import numpy as np
import warnings
# Session setup: silence every warning and let pandas print all columns
# instead of eliding the middle ones.
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = None

# Load the GBK-encoded CSV and discard the 'Unnamed: 0' column
# (presumably an index column written out by an earlier to_csv -- TODO confirm).
alldata = (
    pd.read_csv(r'D:\data\datawhale\data.csv', encoding='gbk')
    .drop(columns='Unnamed: 0')
)
# 数据类型的分析
# Split the column names by dtype: 'object' columns are the non-numeric
# features, every other column is treated as numeric.
is_object = alldata.dtypes == 'object'
non_num_features = is_object.index[is_object].tolist()
num_features = is_object.index[~is_object].tolist()

# 'custid' is only a record identifier and carries no signal; 'status' is
# used as the target of the feature-selection step, so neither may stay in
# the feature list. list.remove raises ValueError if a name is absent.
for excluded in ('custid', 'status'):
    num_features.remove(excluded)
# 数据类型转换
# 把 object 类型变为数值类型:可以考虑分箱 + WOE 转换,也可以用 LabelEncoder 转换,此处采用后一种。
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
# Label-encode every non-numeric column in place.
#
# Only the values that are actually present are encoded. Casting the whole
# column with astype(str) would turn NaN into the literal string 'nan', which
# LabelEncoder would then treat as one more ordinary category -- the missing
# values would silently disappear before the missing-value handling further
# down gets a chance to flag, drop or impute them.
for i in tqdm(non_num_features):
    present = alldata[i].notna().to_numpy()
    if not present.any():
        # Nothing to encode: leave the column entirely missing.
        continue
    lbl = LabelEncoder()
    codes = lbl.fit_transform(alldata.loc[present, i].astype(str))
    if present.all():
        # No gaps: keep the integer codes exactly as LabelEncoder returns them.
        alldata[i] = codes
    else:
        # Gaps present: a float column is required so NaN can be kept.
        encoded = np.full(len(alldata), np.nan, dtype='float64')
        encoded[present] = codes
        alldata[i] = encoded
# 把负值变为空缺值
# Treat the negative codes -1 and -2 in the two "latest day" columns as
# missing values.
#
# The result is assigned back to the frame on purpose: calling
# replace(..., inplace=True) on alldata[col] operates on a temporary column
# selection, which under pandas Copy-on-Write never updates alldata itself
# (and the warning about it is silenced at the top of the file).
for col in ('latest_query_day', 'loans_latest_day'):
    alldata[col] = alldata[col].replace({-2: np.nan, -1: np.nan})
# 缺失值处理
# - 对于缺失比例超过 0.2 的列,新增判断是否为空的列
# - 对于缺失比例超过 0.4 的列,删掉
# - 对于缺失比例小于 0.4、大于 0.1 的列,用监督方法补全,这里用 decision tree
# - 对于缺失比例小于 0.1 的列,数值类型的用 mean 填充,非数值类型的用 mode 填充
# Missing-value handling, one feature at a time, driven by the share of
# missing rows in the column:
#   * ratio >= 0.2     -> add a 0/1 '<col>_isnull' indicator column and
#                         register it as a non-numeric feature;
#   * ratio >  0.4     -> remove the column from the feature lists (the data
#                         itself stays in alldata, it is just not selected);
#   * 0 < ratio <= 0.4 -> impute: mode for non-numeric features, mean for
#                         numeric ones.
#
# all_feature is a snapshot (a new list), so appending to / removing from
# the two feature lists inside the loop does not disturb the iteration.
all_feature = num_features + non_num_features
n_rows = alldata.shape[0]
for i in all_feature:
    # Compute the mask and the ratio once instead of once per comparison.
    missing_mask = alldata[i].isnull()
    missing_ratio = missing_mask.sum() / n_rows

    if missing_ratio >= 0.2:
        indicator = i + '_isnull'
        alldata[indicator] = missing_mask.astype('int64')
        non_num_features.append(indicator)

    if missing_ratio > 0.4:
        # Too sparse to be useful: stop treating the column as a feature.
        if i in non_num_features:
            non_num_features.remove(i)
        else:
            num_features.remove(i)
    elif missing_ratio > 0:
        if alldata[i].dtypes == 'object' or i in non_num_features:
            fill_value = alldata[i].mode()[0]
        else:
            fill_value = alldata[i].mean()
        # Assign the filled column back: fillna(..., inplace=True) on
        # alldata[i] works on a temporary selection and does not update
        # alldata under pandas Copy-on-Write.
        alldata[i] = alldata[i].fillna(fill_value)
# 无关特征删除
# 使用卡方检验来挑选变量
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Chi-squared feature selection: keep the k=2 features that score highest
# against the 'status' column.
# chi2 only accepts non-negative feature values.
# NOTE(review): the transformed array is not stored anywhere -- presumably it
# was only displayed as notebook cell output; confirm before relying on it.
selected_columns = non_num_features + num_features
model1 = SelectKBest(score_func=chi2, k=2)
model1.fit_transform(alldata[selected_columns], alldata['status'])