import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif as MIC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
def reda_data_to_csv(path=r'E:\AIprojectspace\datawhale_dataanly\data\data.csv',
                     encoding='gbk'):
    """Read the raw data CSV and print a quick preview.

    Parameters
    ----------
    path : str
        CSV file to load. Defaults to the original hard-coded location so
        existing callers keep working; parameterized so other datasets can
        reuse this loader.
    encoding : str
        File encoding of the CSV (default 'gbk', as the original file uses).

    Returns
    -------
    pandas.DataFrame
        The loaded data.
    """
    df = pd.read_csv(path, encoding=encoding)
    # Quick sanity check of what was loaded.
    print(df.head())
    print(df.shape)
    print(df.columns)
    return df
def fea_categorical_check():
    """Return the names of columns judged irrelevant for modelling.

    These are identifiers, free-text names, timestamps and a mostly-empty
    column that carry no predictive value for the classifier.
    """
    irrelevant_columns = [
        'Unnamed: 0', 'custid', 'trade_no', 'bank_card_no', 'id_name',
        'latest_query_time', 'source', 'loans_latest_time',
        'first_transaction_time', 'student_feature',
    ]
    return irrelevant_columns
def dropTable(df, delete):
    """Return *df* without the columns listed in *delete*, printing a summary.

    The original DataFrame is left untouched; a trimmed copy is returned.
    """
    trimmed = df.drop(columns=delete)
    print(trimmed.info())
    return trimmed
def look_data(df):
    """Report columns with a high missing-value rate, then return *df* unchanged.

    Prints every column whose fraction of missing entries exceeds 60%,
    followed by the DataFrame's overall summary info.
    """
    n_rows = df.shape[0]
    # count() excludes NaN, so (rows - count) / rows is the missing fraction.
    miss_rate = pd.DataFrame((n_rows - df.count()) / n_rows)
    high_missing = miss_rate[miss_rate.iloc[:, 0] > 0.6]
    print(high_missing)
    print(df.info())
    return df
def data_miss_value(df):
    """Fill each column's missing values with that column's most frequent value.

    Imputation is done column by column in place on *df*. sklearn imputers
    require a 2-D feature matrix, hence the reshape(-1, 1).

    Returns
    -------
    pandas.DataFrame
        The same DataFrame, with missing values imputed.
    """
    # Hoisted out of the loop: one imputer instance, re-fitted per column,
    # instead of constructing a new SimpleImputer on every iteration.
    imp_mode = SimpleImputer(strategy='most_frequent')
    for col_idx in range(df.shape[1]):
        feature = df.iloc[:, col_idx].values.reshape(-1, 1)
        df.iloc[:, col_idx] = imp_mode.fit_transform(feature)
    print(df.info())
    return df
def catalay_data(df):
    """Ordinal-encode the categorical address column and split features/label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'reg_preference_for_trad' (categorical) and the binary
        label column 'status'.

    Returns
    -------
    (x, y)
        x: feature DataFrame with 'status' removed.
        y: the 'status' label Series.
    """
    df['reg_preference_for_trad'] = OrdinalEncoder().fit_transform(
        df['reg_preference_for_trad'].values.reshape(-1, 1))
    # Select the label by name instead of the previously hard-coded
    # positional index (iloc[:, 38]), which silently broke whenever the
    # column layout shifted.
    y = df['status']
    x = df.drop(['status'], axis=1)
    return x, y
def feature_project(x, y):
    """Feature selection (variance + mutual information) and SMOTE rebalancing.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix.
    y : array-like
        Binary 0/1 labels.

    Returns
    -------
    (x, y)
        Features and labels oversampled to a balanced class distribution.
    """
    # 1. Variance filter: with the default threshold of 0, only constant
    #    (zero-variance) features would be removed; shape is printed to
    #    confirm nothing was dropped.
    X_fsvar = VarianceThreshold().fit_transform(x)
    print(X_fsvar.shape)

    # 2. Mutual information between each feature and the target:
    #    0 means independent, larger values mean stronger dependence.
    result = MIC(x, y)
    print(result.shape)
    print((result == 0).sum())

    # Drop features carrying no mutual information with the target.
    # len(result) replaces the previously hard-coded range(79), so this
    # works for any number of input features.
    delete = [i for i in range(len(result)) if result[i] == 0]
    print(delete)
    # Drop by column name; the original passed a whole DataFrame
    # (x.iloc[:, delete]) as drop labels, which relied on obscure behavior.
    x_ = x.drop(columns=x.columns[delete])
    print(x_.shape)

    # 3. Class imbalance: report the 1/0 ratio, then oversample with SMOTE.
    print(y[y == 1].count() / y[y == 0].count())
    sm = SMOTE(random_state=520)
    # fit_resample replaces fit_sample, which was removed in imblearn >= 0.6.
    x, y = sm.fit_resample(x_, y)
    n_sample_ = x.shape[0]
    print(n_sample_)
    counts = pd.Series(y).value_counts()  # computed once, reused below
    print(counts)
    n_1_sample = counts[1]
    n_0_sample = counts[0]
    print('样本个数:{}; 1占{:.2%}; 0占{:.2%}'.format(n_sample_, n_1_sample / n_sample_, n_0_sample / n_sample_))
    return x, y
def data_split(x, y):
    """Split features and labels into 70/30 train/test sets.

    Uses a fixed random_state (420) so the split is reproducible.

    Returns
    -------
    (Xtrain, Xtest, Ytrain, Ytest)
    """
    split_result = train_test_split(x, y, test_size=0.3, random_state=420)
    Xtrain, Xtest, Ytrain, Ytest = split_result
    return Xtrain, Xtest, Ytrain, Ytest
def main():
    """Run the full preprocessing pipeline: load, clean, encode, select, split."""
    # 1.1 Load and preview the raw data.
    df = reda_data_to_csv()
    look_data(df)
    # 1.2 Drop irrelevant identifier/time columns.
    delete_name = fea_categorical_check()
    df = dropTable(df, delete_name)
    # 1.3 Impute missing values column by column.
    df = data_miss_value(df)
    # 1.4 Encode the categorical feature and split off the label.
    x, y = catalay_data(df)
    # 1.5 Feature selection plus SMOTE rebalancing.
    x, y = feature_project(x, y)
    # 1.6 Train/test split.
    Xtrain, Xtest, Ytrain, Ytest = data_split(x, y)
# Run the full preprocessing pipeline only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    main()