导入数据,将 标签(Y)和样本(X) 分开来
# Load the raw data and separate the label (y) from the features (X).
import pandas as pd

X = pd.read_csv("data.csv")
y = X["status"]
# BUG FIX: the original `X.drop(["status"], axis=1)` discarded its return
# value (no `inplace=True`, no rebind), so the label column stayed inside X
# and leaked into every model trained below. Rebind the result.
X = X.drop(["status"], axis=1)
X.head(5)
将数据分成训练集和测试集
# Split into train/test sets (70/30) with a fixed seed for reproducibility.
from sklearn.model_selection import train_test_split

# BUG FIX: the rest of the script references `y_train` / `y_test` (lowercase),
# but the original unpacked into `Y_train` / `Y_test`, which raises NameError
# at the first model fit. Use the lowercase names consistently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=2018
)
查看数据的信息
# Inspect dtypes and non-null counts of the training features
# (output pasted below shows 90 columns, several with missing values).
X_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3327 entries, 110 to 1274
Data columns (total 90 columns):
Unnamed: 0 3327 non-null int64
custid 3327 non-null int64
trade_no 3327 non-null object
bank_card_no 3327 non-null object
low_volume_percent 3325 non-null float64
middle_volume_percent 3325 non-null float64
take_amount_in_later_12_month_highest 3327 non-null int64
trans_amount_increase_rate_lately 3325 non-null float64
trans_activity_month 3325 non-null float64
trans_activity_day 3325 non-null float64
transd_mcc 3325 non-null float64
trans_days_interval_filter 3321 non-null float64
trans_days_interval 3325 non-null float64
regional_mobility 3325 non-null float64
student_feature 1208 non-null float64
repayment_capability 3327 non-null int64
is_high_user 3327 non-null int64
number_of_trans_from_2011 3325 non-null float64
first_transaction_time 3325 non-null float64
historical_trans_amount 3327 non-null int64
historical_trans_day 3325 non-null float64
rank_trad_1_month 3325 non-null float64
trans_amount_3_month 3327 non-null int64
avg_consume_less_12_valid_month 3325 non-null float64
abs 3327 non-null int64
top_trans_count_last_1_month 3325 non-null float64
avg_price_last_12_month 3327 non-null int64
avg_price_top_last_12_valid_month 3249 non-null float64
reg_preference_for_trad 3325 non-null object
trans_top_time_last_1_month 3321 non-null float64
trans_top_time_last_6_month 3321 non-null float64
consume_top_time_last_1_month 3321 non-null float64
consume_top_time_last_6_month 3321 non-null float64
cross_consume_count_last_1_month 3037 non-null float64
trans_fail_top_count_enum_last_1_month 3314 non-null float64
trans_fail_top_count_enum_last_6_month 3314 non-null float64
trans_fail_top_count_enum_last_12_month 3314 non-null float64
consume_mini_time_last_1_month 3312 non-null float64
max_cumulative_consume_later_1_month 3327 non-null int64
max_consume_count_later_6_month 3321 non-null float64
railway_consume_count_last_12_month 3319 non-null float64
pawns_auctions_trusts_consume_last_1_month 3327 non-null int64
pawns_auctions_trusts_consume_last_6_month 3327 non-null int64
jewelry_consume_count_last_6_month 3319 non-null float64
status 3327 non-null int64
source 3327 non-null object
first_transaction_day 3325 non-null float64
trans_day_last_12_month 3325 non-null float64
id_name 3125 non-null object
apply_score 3107 non-null float64
apply_credibility 3107 non-null float64
query_org_count 3107 non-null float64
query_finance_count 3107 non-null float64
query_cash_count 3107 non-null float64
query_sum_count 3107 non-null float64
latest_query_time 3107 non-null object
latest_one_month_apply 3107 non-null float64
latest_three_month_apply 3107 non-null float64
latest_six_month_apply 3107 non-null float64
loans_score 3112 non-null float64
loans_credibility_behavior 3112 non-null float64
loans_count 3112 non-null float64
loans_settle_count 3112 non-null float64
loans_overdue_count 3112 non-null float64
loans_org_count_behavior 3112 non-null float64
consfin_org_count_behavior 3112 non-null float64
loans_cash_count 3112 non-null float64
latest_one_month_loan 3112 non-null float64
latest_three_month_loan 3112 non-null float64
latest_six_month_loan 3112 non-null float64
history_suc_fee 3112 non-null float64
history_fail_fee 3112 non-null float64
latest_one_month_suc 3112 non-null float64
latest_one_month_fail 3112 non-null float64
loans_long_time 3112 non-null float64
loans_latest_time 3112 non-null object
loans_credit_limit 3112 non-null float64
loans_credibility_limit 3112 non-null float64
loans_org_count_current 3112 non-null float64
loans_product_count 3112 non-null float64
loans_max_limit 3112 non-null float64
loans_avg_limit 3112 non-null float64
consfin_credit_limit 3112 non-null float64
consfin_credibility 3112 non-null float64
consfin_org_count_current 3112 non-null float64
consfin_product_count 3112 non-null float64
consfin_max_limit 3112 non-null float64
consfin_avg_limit 3112 non-null float64
latest_query_day 3107 non-null float64
loans_latest_day 3112 non-null float64
dtypes: float64(70), int64(13), object(7)
memory usage: 2.3+ MB
将一些没用的特征删除
# Drop columns that carry no predictive signal:
#   bank_card_no      - single-valued in this sample
#   trade_no          - per-record identifier (original comment called it an
#                       ID-card number — presumably a trade identifier; verify)
#   id_name           - customer name
#   custid            - customer id
#   loans_latest_time / latest_query_time - raw timestamp columns
#   source            - single-valued
X_train.drop(
    ["bank_card_no", "trade_no", "id_name", "custid",
     "loans_latest_time", "latest_query_time", "source"],
    axis=1,
    inplace=True,
)
把值转为离散型数据
# Distribution of the city-preference categorical before encoding it
# (counts pasted below).
X_train['reg_preference_for_trad'].value_counts()
//
一线城市 2380
三线城市 747
境外 103
二线城市 92
其他城市 3
//
def get_change(dt):
    """Encode a city-preference category as an ordinal integer.

    Mapping: first-tier city -> 1, second-tier -> 2, third-tier -> 3,
    overseas -> 4; anything else (including NaN and '其他城市') -> 5.
    """
    encoding = {
        '一线城市': 1,
        '二线城市': 2,
        '三线城市': 3,
        '境外': 4,
    }
    return encoding.get(dt, 5)
# BUG FIX: the original wrapped a list comprehension in np.array(), but
# numpy is never imported in this script. Series.map applies the same
# element-wise encoding (NaN is passed through to get_change and becomes 5,
# matching the original behavior) without needing numpy at all.
X_train['reg_preference_for_trad'] = X_train['reg_preference_for_trad'].map(get_change)
X_train['reg_preference_for_trad'].value_counts()
//
1 2380
3 747
4 103
2 92
5 5
Name: reg_preference_for_trad, dtype: int64
处理缺失值,将缺失值填充为那一列的均值
# Impute remaining missing values with each column's mean.
for column in X_train.columns:
    X_train[column] = X_train[column].fillna(X_train[column].mean())
# Feature selection #1: variance filter — drop features with variance <= 3.
from sklearn.feature_selection import VarianceThreshold

# BUG FIX: the original discarded the transformed matrix; bind it so the
# selection result can actually be used (or at least inspected) downstream.
X_var = VarianceThreshold(threshold=3).fit_transform(X_train)
# Feature selection #2: L1-penalized linear SVM zeroes out weak features.
# BUG FIX: neither LinearSVC nor SelectFromModel was imported in the original.
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)
model = SelectFromModel(lsvc, prefit=True)
X_new_1 = model.transform(X_train)
X_new_1.shape  # fewer features remain
# Feature selection #3: extra-trees impurity importances.
# BUG FIX: ExtraTreesClassifier (and SelectFromModel) was never imported
# in the original script.
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

clf = ExtraTreesClassifier().fit(X_train, y_train)
clf.feature_importances_
model = SelectFromModel(clf, prefit=True)
X_new_2 = model.transform(X_train)
X_new_2.shape  # fewer features remain
# Feature selection #4: random-forest importance ranking
# (adapted from a reference implementation).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np

forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=1)
forest.fit(X_train, y_train)
importance = forest.feature_importances_
imp_result = np.argsort(importance)[::-1]  # feature indices, best first

# BUG FIXES vs. original:
#  - it iterated `x_train` (undefined; the variable is `X_train`),
#  - `names` was never defined (it is the feature-column index),
#  - it printed `names[i]` next to `importance[imp_result[i]]`, pairing each
#    score with the WRONG feature name, and appended unranked `names[i]`,
#    so `columns[:44]` were just the first 44 columns, not the top 44.
names = X_train.columns
columns = []
for rank, idx in enumerate(imp_result, start=1):
    print("%2d. %-*s %f" % (rank, 30, names[idx], importance[idx]))
    columns.append(names[idx])

# Keep the 44 most important features.
columns = columns[0:44]
data = X_train[columns]
def RF_classify(train_data, train_label, test_data, test_label):
    """Fit a random forest on one CV fold and return its test accuracy.

    BUG FIX: the original body ignored all four parameters and fit/scored
    on the global `x_train`/`y_train`/`x_test`/`y_test`, which only worked
    by accident of the calling loop's variable names.
    """
    forest = RandomForestClassifier(n_estimators=15, random_state=0, n_jobs=-1)
    forest.fit(train_data, train_label)
    # score() returns mean accuracy on the held-out fold.
    return forest.score(test_data, test_label)
# 10-fold stratified cross-validation of the random forest on the
# 44 selected features.
skf = StratifiedKFold(n_splits=10)
ac = []
# BUG FIX: the original split `data1`, which is never defined — the selected
# feature frame is `data`. `label` was also undefined; it must be the status
# Series row-aligned with `data` (`y_train` in this script) —
# TODO(review): confirm this alignment against the upstream split.
label = y_train
for train_index, test_index in skf.split(data, label):
    x_train, x_test = data.iloc[train_index, :], data.iloc[test_index, :]
    y_train_cv, y_test_cv = label.iloc[train_index], label.iloc[test_index]
    acc = RF_classify(x_train, y_train_cv, x_test, y_test_cv)
    # acc = lightgbm_classify(x_train, y_train_cv, x_test, y_test_cv)
    ac.append(acc)
开始调模型
//逻辑回归
# Model 1: logistic regression baseline with default hyper-parameters.
# NOTE(review): X_test never received the same cleaning/encoding applied to
# X_train above — confirm the preprocessing is mirrored before trusting this.
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
predictions = log_reg.predict(X_test)
accuracy_score(y_test, predictions)
//SVM
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
pre = svc.predict(X_test)
accuracy_score(y_test, pre)
//RF
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
pre = RF.predict(X_test)
accuracy_score(y_test, pre )
//XGboost
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
pre = xgb.predict(X_test)
accuracy_score(pre, y_test)
预测结果都差不多 但xgboost和RF的效果会好一点,没有进行调参。