"Binary Classification" for a Precision Bank Marketing Solution: Code Archive

from numpy import int64
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso, LinearRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFECV
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')  # suppress sklearn convergence/deprecation noise
sns.set(style="darkgrid")
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

# Field descriptions
#
# NO    Field name   Type     Description
# 1     ID           Int      Unique customer identifier
# 2     age          Int      Customer age
# 3     job          String   Customer occupation
# 4     marital      String   Marital status
# 5     education    String   Education level
# 6     default      String   Whether the customer has a default on record
# 7     balance      Int      Average yearly account balance
# 8     housing      String   Whether the customer has a housing loan
# 9     loan         String   Whether the customer has a personal loan
# 10    contact      String   Contact communication type
# 11    day          Int      Day of month of the last contact
# 12    month        String   Month of the last contact
# 13    duration     Int      Duration of the last contact
# 14    campaign     Int      Number of contacts with this customer during this campaign
# 15    pdays        Int      Days since the customer was last contacted in a previous campaign (999 means never contacted)
# 16    previous     Int      Number of contacts with this customer before this campaign
# 17    poutcome     String   Outcome of the previous campaign
# 18    y            Int      Target: whether the customer subscribes to a term deposit

data_train = pd.read_csv('train_set.csv')
data_test = pd.read_csv('test_set.csv')
ids_test = data_test['ID']

print(data_train.shape[0])

# data_train['cppv']=data_train['campaign']+data_train['previous']
# data_test['cppv']=data_test['campaign']+data_test['previous']
# data_train.drop(['campaign','previous'], axis=1, inplace=True)
# data_test.drop(['campaign','previous'], axis=1, inplace=True)

# Rela_grouped=data_train.groupby(['cppv'])
# Rela_Survival_Rate=(Rela_grouped.sum()/Rela_grouped.count())['y']
# Rela_count=Rela_grouped.count()['y']
#
# ax1=Rela_count.plot(kind='bar',color='g')
# ax2=ax1.twinx()
# ax2.plot(Rela_Survival_Rate.values,color='r')
# ax1.set_xlabel('Relatives')
# ax1.set_ylabel('Number')
# ax2.set_ylabel('Survival Rate')
# plt.title('Survival Rate by Relatives')
# plt.grid(True,linestyle='-',color='0.7')
# plt.show()

# g = sns.FacetGrid(data_train, col='y')
# g.map(plt.hist, 'day', bins=30)
# plt.show()


print("数值处理1:标签指标one-hot编码处理")


data_train.drop(['ID'], axis=1, inplace=True)
data_test.drop(['ID'], axis=1, inplace=True)

dummy = pd.get_dummies(data_train[['month','job','marital','education','default','housing','loan','contact','poutcome']])
dummyTest = pd.get_dummies(data_test[['month','job','marital','education','default','housing','loan','contact','poutcome']])
data_train = pd.concat([dummy, data_train], axis=1)
data_train.drop(['job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
data_test = pd.concat([dummyTest, data_test], axis=1)
data_test.drop(['job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
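# pd.get_dummies run separately on train and test can yield mismatched columns
# if a category occurs in only one file. One way to guard against that is to
# align the test frame to the training columns (all-zero dummies are created
# for categories absent from the test file); a sketch, left commented like the
# other experiments in this archive:
# data_test = data_test.reindex(columns=[c for c in data_train.columns if c != 'y'], fill_value=0)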

data_train['day'].replace([30,13,15,4,14,12,18],4,inplace=True)
data_train['day'].replace([5,20,21,11,8,16,2,3],3,inplace=True)
data_train['day'].replace([17,9,6,27,7,22,28],2,inplace=True)
data_train['day'].replace([23,25,26,10,29,19],1,inplace=True)
data_train['day'].replace([1,24,31],0,inplace=True)

data_test['day'].replace([30,13,15,4,14,12,18],4,inplace=True)
data_test['day'].replace([5,20,21,11,8,16,2,3],3,inplace=True)
data_test['day'].replace([17,9,6,27,7,22,28],2,inplace=True)
data_test['day'].replace([23,25,26,10,29,19],1,inplace=True)
data_test['day'].replace([1,24,31],0,inplace=True)
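# The five replace() calls above are duplicated for train and test; an
# equivalent form keeps the bins in one dict and maps each frame through it.
# A sketch with the same groups as above, left commented:
# day_bins = {}
# for days, label in [([30,13,15,4,14,12,18], 4), ([5,20,21,11,8,16,2,3], 3),
#                     ([17,9,6,27,7,22,28], 2), ([23,25,26,10,29,19], 1),
#                     ([1,24,31], 0)]:
#     day_bins.update({d: label for d in days})
# data_train['day'] = data_train['day'].map(day_bins)
# data_test['day'] = data_test['day'].map(day_bins)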


# data_train['month1'] = data_train.month.apply(lambda x: 4 if x in ['may'] else 0)
# data_train['month1'] = data_train.month.apply(lambda x: 3 if x in ['aug','jul','apr'] else 0)
# data_train['month1'] = data_train.month.apply(lambda x: 2 if x in ['jun','feb','nov','oct'] else 0)
# data_train['month1'] = data_train.month.apply(lambda x: 1 if x in ['sep','mar'] else 0)
#
# data_test['month1'] = data_test.month.apply(lambda x: 4 if x in ['may'] else 0)
# data_test['month1'] = data_test.month.apply(lambda x: 3 if x in ['aug','jul','apr'] else 0)
# data_test['month1'] = data_test.month.apply(lambda x: 2 if x in ['jun','feb','nov','oct'] else 0)
# data_test['month1'] = data_test.month.apply(lambda x: 1 if x in ['sep','mar'] else 0)
# #
data_train.drop(['month'], inplace=True, axis=1)
data_test.drop(['month'], inplace=True, axis=1)
# data_train.drop(['day','job_management','marital_single'], axis=1, inplace=True)
# data_test.drop(['day','job_management','marital_single'], axis=1, inplace=True)


# data_train['month'].replace(['may'],4,inplace=True)
# data_train['month'].replace(['aug','jul','apr'],3,inplace=True)
# data_train['month'].replace(['jun','feb','nov','oct'],2,inplace=True)
# data_train['month'].replace(['sep','mar'],1,inplace=True)
# data_train['month'].replace(['jan','dec'],0,inplace=True)

# Drop more features (experiment)
# data_train.drop(['age','balance','duration','pdays','previous','day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
# data_test.drop(['age','balance','duration','pdays','previous','day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)

# default, housing and loan are binary, so one of each dummy pair can be dropped
# data_train.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)
# data_test.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)


################################
######### Data cleaning ########
################################

data_train['pdays'].replace(-1,9999,inplace=True)
data_test['pdays'].replace(-1,9999,inplace=True)
print("数值处理2:pdays将-1替换为999")
# data_train.drop(['pdays'], inplace=True, axis=1)
# data_test.drop(['pdays'], inplace=True, axis=1)


# g = sns.FacetGrid(data_train, col='y')
# g.map(plt.hist, 'pdays', bins=20)
# plt.show()
# data_train.drop(['pdays'], inplace=True, axis=1)
# data_test.drop(['pdays'], inplace=True, axis=1)

y = data_train['y']
X = data_train[data_train.columns[: -1]]
# # X.info()
# The mean of pdays seen earlier is about 40; -1 sits close to that mean but far
# from the max of 854, so every -1 does need to be replaced by a large sentinel.
# Preprocessing note: pdays == -1 in the data means the customer was never contacted.



# Rescale the high-variance numeric fields with MinMaxScaler or StandardScaler
print("Preprocessing step 3: scale the numeric fields")
scaler = MinMaxScaler()
# numerical = ['age','balance', 'duration', 'pdays', 'previous']
# X[numerical] = scaler.fit_transform(X[numerical])
# data_test[numerical] = scaler.transform(data_test[numerical])
print(data_test.shape)
X = scaler.fit_transform(X)
data_test = scaler.transform(data_test)  # fit on the training data only; fit_transform here would leak test-set statistics
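# Since make_pipeline is already imported, the scale-then-fit pattern can be
# bundled so that cross-validation refits the scaler inside each fold
# automatically. A sketch on the raw (unscaled) columns, left commented; the
# estimator choice is illustrative:
# pipe = make_pipeline(MinMaxScaler(),
#                      RandomForestClassifier(n_estimators=90, random_state=0, n_jobs=-1))
# print(cross_val_score(pipe, data_train[data_train.columns[:-1]], y, scoring='roc_auc', cv=5))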

# tsvd = TruncatedSVD(n_components=46)
# data_test = tsvd.fit_transform(data_test)
# Split off a holdout set for local evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.06, random_state=1)
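# With only about 11.7% positives, a stratified split keeps the small 6%
# holdout representative. A sketch of that variant (not the split the archived
# scores above were produced with):
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.06, random_state=1, stratify=y)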
# X_train = tsvd.fit_transform(X_train)
# X_test = tsvd.fit_transform(X_test)
# print(X_train.shape)

# Add polynomial features
# polynomial_interaction = PolynomialFeatures(degree=2,include_bias=False)
# # Add polynomial features, interaction terms only
# polynomial_interaction = PolynomialFeatures(degree=2,interaction_only=True,include_bias=False)
# X_train = polynomial_interaction.fit_transform(X_train)
# X_test = polynomial_interaction.transform(X_test)
# data_test = polynomial_interaction.transform(data_test)
# print('after Polynomial:',X_train.shape)
#
# # # Keep ~99% of the variance with principal component analysis
# pca = PCA(n_components=100,whiten=True)
# X_train = pca.fit_transform(X_train)
# X_test = pca.transform(X_test)
# data_test = pca.transform(data_test)
# print('after PCA:',X_train.shape)

# # Univariate feature selection (f_classif is the ANOVA F-test, not chi-square)
# selector = SelectKBest(f_classif,k=300)
# X_train = selector.fit_transform(X_train,y_train)
# X_test = selector.transform(X_test)
# print('after SelectKBest:',X_train.shape)

# print(X_train['pdays'])

################################
####### Model evaluation #######
################################


# print('Decision tree, unsatisfactory score')
# clf = DecisionTreeClassifier(random_state=11)
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
#
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
#
print('Random forest, 0.919203')
clf = RandomForestClassifier(n_estimators=90, random_state=0,oob_score=True,n_jobs=-1)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
print(clf.score(X_test, y_test))
y_predprob = clf.predict_proba(X_test)
y_predprob = y_predprob[:, 1]
print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

# Grid-search the random forest's best n_estimators; answer: 90
# param_test1 ={'n_estimators':range(10,100,5)}
# gsearch1= GridSearchCV(estimator =RandomForestClassifier(min_samples_split=100,
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt',random_state=10),
#                        param_grid =param_test1,scoring='roc_auc',cv=5)
# gsearch1.fit(X_train, y_train)
# print(gsearch1.best_params_)
# y_predprob = gsearch1.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
# predictions = gsearch1.predict(X_test)
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
#
# print('Logistic regression, 0.904655, 0.915316')
# # print(X_train)
# #clf = Lasso(alpha=0.5)
# clf = LogisticRegression(random_state=0,solver='newton-cg',class_weight='balanced',penalty='l2',n_jobs=-1)
# # solver : str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, optional (default=’liblinear’).
# clf.fit(X_train, y_train)
# # clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# # print(classification_report(y_test, predictions))
# # print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# # print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
#
# relation = pd.DataFrame({"columns":list(data_train.columns)[0:-1], "coef":list(clf.coef_.T)})
# print('Coefficients:',relation)

# # Grid-search the logistic regression's best parameters; answer:
# # best C : LogisticRegression(C=7.742636826811269, class_weight=None, dual=False,
# #                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
# #                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
# #                    random_state=None, solver='warn', tol=0.0001, verbose=0,
# #                    warm_start=False)
# penalty = ['l1','l2']
# C=np.logspace(0,4,10)
# hyperparameters = dict(C=C,penalty=penalty)
# gridsearch = GridSearchCV(clf,hyperparameters,cv=5,verbose=0)
# best_clf= gridsearch.fit(X_train, y_train)
# print('best C :',best_clf.best_estimator_)
# print(gridsearch.best_params_)
# y_predprob = gridsearch.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
# predictions = gridsearch.predict(X_test)
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

# print('AdaBoost')
# clf = AdaBoostClassifier(n_estimators=60, random_state=90)
#
# clf.fit(X_train, y_train)
# predictionsByadaBoost = clf.predict(X_test)
# print(classification_report(y_test, predictionsByadaBoost))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
# pred = clf.predict_proba(X_test)
# dataPred = pd.DataFrame(pred, columns=['pred0', 'pred'])
# dataPred.drop('pred0', axis=1, inplace=True)
# print(dataPred)
#
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# predictions_train =  clf.predict(X_train)
# y_predprob_train = clf.predict_proba(X_train)
# y_predprob_train = y_predprob_train[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictionsByadaBoost))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
# print("Accuracy y_train : %.4g" % metrics.accuracy_score(y_train, predictions_train))
# print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_predprob_train))
# #
#
#
# # #
# print('Neural network')
# # ‘lbfgs’ is an optimizer in the family of quasi-Newton methods.
# # ‘sgd’ refers to stochastic gradient descent.
# # ‘adam’ refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba
# clf = MLPClassifier(solver='adam', hidden_layer_sizes=(80,80),
#                     random_state=1)
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(clf.score(X_test, y_test))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
# print('Neural network end')
# # # Export the results
ID = list(range(25318,36170))
submission = pd.DataFrame(ID)
submission.rename(columns = {0: 'ID'}, inplace = True)
# Convert the predicted probabilities from an array to a DataFrame
y_predprob_test = clf.predict_proba(data_test)
y_predprob_test = y_predprob_test[:, 1]
y_predprob_DataFrame = pd.DataFrame(y_predprob_test)
submission['pred'] =y_predprob_DataFrame
submission.to_csv('Result.csv', index = False)
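# The submitted probabilities above come from a model fitted on only 94% of the
# training data; refitting on the full training set before scoring the test file
# usually helps a little. A sketch with the same estimator, left commented:
# clf_full = RandomForestClassifier(n_estimators=90, random_state=0, n_jobs=-1)
# clf_full.fit(X, y)
# submission['pred'] = clf_full.predict_proba(data_test)[:, 1]
# submission.to_csv('Result.csv', index=False)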

# To guard against overfitting, halve the learning rate and double the number of iterations
# gbm1 = GradientBoostingClassifier(learning_rate=0.001, n_estimators=10000, max_depth=7, min_samples_leaf=70,
#                                   min_samples_split=1300, subsample=0.8, random_state=10)
# gbm1.fit(X_train, y_train)
#
# y_pred = gbm1.predict(X_test)
# y_predprob = gbm1.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
# print("AUC Score (Train): %f" % metrics.roc_auc_score(y_test, y_predprob))

# print('KNN, unsatisfactory score')
# clf = KNeighborsClassifier(n_neighbors=5)
# clf.fit(X_train,y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]

# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

# print('SVM (support vector machine)')
# clf = SVC(kernel='rbf',C=1,gamma='auto',probability=True).fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

# Naive Bayes
# print('Naive Bayes')
# clf = GaussianNB()
#
# clf_sigmoid = CalibratedClassifierCV(clf,cv=5)
# clf_sigmoid.fit(X_train,y_train)
# predictions = clf_sigmoid.predict(X_test)
# y_predprob = clf_sigmoid.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

################################
# AdaBoost was the algorithm chosen for the first submission
################################
# print('AdaBoost')
# adaBoost = AdaBoostClassifier(n_estimators=50, random_state=11)
# adaBoost.fit(X_train, y_train)
#
# age_null = pd.isnull(data_test['age'])
# data_null = data_test[age_null == True]
# # print(data_null)
#
# id = data_test["ID"]
# print(id)
# X_test.drop(['ID'], axis=1, inplace=True)
#
# submission = pd.DataFrame({
#         "ID": id
#     })
#
# submission[['ID']].astype(int)
# # submission[['ID']] = submission[['ID']].astype(int)
# submission.to_csv('submission.csv', index=False)

# data_test.dropna(inplace=True)
# print(np.isnan(data_test).any())
# submission.replace(np.nan, 0, inplace=True)


# predictionsByadaBoost = adaBoost.predict_proba(X_test)
#
# submission = pd.DataFrame({
#         "ID": id,
#         "pred": predictionsByadaBoost
#     })
# submission.to_csv('submission.csv', index=False)

 

First submission, with hardly any feature engineering; the score is not ideal yet.

0.9157894736842105
Accuracy : 0.9158
AUC Score (Test): 0.932477

 

Process analysis

from numpy import int64
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.metrics import roc_auc_score

data_train = pd.read_csv('/home/kesci/input/firstdata1587/train_set.csv')
data_test = pd.read_csv('/home/kesci/input/firstdata1587/test_set.csv')
data_train.describe()
Out[4]:
                 ID           age        balance           day      duration      campaign         pdays      previous             y
count  25317.000000  25317.000000   25317.000000  25317.000000  25317.000000  25317.000000  25317.000000  25317.000000  25317.000000
mean   12659.000000     40.935379    1357.555082     15.835289    257.732393      2.772050     40.248766      0.591737      0.116957
std     7308.532719     10.634289    2999.822811      8.319480    256.975151      3.136097    100.213541      2.568313      0.321375
min        1.000000     18.000000   -8019.000000      1.000000      0.000000      1.000000     -1.000000      0.000000      0.000000
25%     6330.000000     33.000000      73.000000      8.000000    103.000000      1.000000     -1.000000      0.000000      0.000000
50%    12659.000000     39.000000     448.000000     16.000000    181.000000      2.000000     -1.000000      0.000000      0.000000
75%    18988.000000     48.000000    1435.000000     21.000000    317.000000      3.000000     -1.000000      0.000000      0.000000
max    25317.000000     95.000000  102127.000000     31.000000   3881.000000     55.000000    854.000000    275.000000      1.000000
 

25,317 records in total. Age ranges from 18 to 95. balance (savings) ranges from -8019 to 102127; its standard deviation of 2999.82 is large, and with a mean of 1357, a 75th percentile of 1435 but a 25th percentile of only 73, the gap in savings is considerable (wicked capitalism). day (day of month of the last contact) ranges 1 to 31; obviously a month runs from the 1st to the 31st, so this feature may well be unrelated to the target. duration (length of the contact) ranges 0 to 3881; the guess is that this is a number of days. campaign (contacts during this campaign) ranges 1 to 55. pdays (time since the last contact) ranges -1 to 854; there is no 999 here, so -1 presumably means never contacted, and anything above -1 is how many days ago the last contact was. previous (contacts before this campaign) ranges 0 to 275, with a mean of 0.59, less than one contact.
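The -1 sentinel splits the customers into "never contacted" and "previously contacted" groups, and comparing their subscription rates makes the feature's value concrete. A sketch on the raw frame (it assumes the -1 values have not been replaced yet, and is left commented like the other probes):

# never = data_train[data_train['pdays'] == -1]['y'].mean()
# before = data_train[data_train['pdays'] > -1]['y'].mean()
# print('buy rate, never contacted: %.3f; previously contacted: %.3f' % (never, before))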

In [5]:
# Occupation vs. buying the term deposit
y_0 = data_train.job[data_train.y == 0].value_counts()
y_1 = data_train.job[data_train.y == 1].value_counts()
df = pd.DataFrame({u'buy': y_1, u'not buy': y_0})
df.plot(kind='bar', stacked=True)
plt.title(u"job to buy")
plt.ylabel(u"counts")
plt.show()
 
In [14]:
# Marital status vs. buying
# Nothing conclusive here
y_0 = data_train.marital[data_train.y == 0].value_counts()
y_1 = data_train.marital[data_train.y == 1].value_counts()
df = pd.DataFrame({u'buy': y_1, u'not buy': y_0})
df.plot(kind='bar', stacked=True)
plt.title(u"marital to buy")
plt.ylabel(u"counts")
plt.show()
 
In [15]:
# Education vs. buying
y_0 = data_train.education[data_train.y == 0].value_counts()
y_1 = data_train.education[data_train.y == 1].value_counts()
df = pd.DataFrame({u'buy': y_1, u'not buy': y_0})
df.plot(kind='bar', stacked=True)
plt.title(u"education to buy")
plt.ylabel(u"counts")
plt.show()
 
In [24]:
# Outcome of the previous campaign vs. buying
# poutcome turns out to be quite important: customers whose previous campaign
# succeeded buy again at a very high rate
y_0 = data_train.poutcome[data_train.y == 0].value_counts()
y_1 = data_train.poutcome[data_train.y == 1].value_counts()
df = pd.DataFrame({u'buy': y_1, u'not buy': y_0})
df.plot(kind='bar', stacked=True)
plt.title(u"poutcome to buy")
plt.ylabel(u"counts")
plt.show()
 
 

day and month only record when the customer was last contacted, so they are easily written off as noise features. Let the statistics speak.

In [3]:
# Effect of month on the target
y_0 = data_train.month[data_train.y == 0].value_counts()
y_1 = data_train.month[data_train.y == 1].value_counts()
df = pd.DataFrame({u'buy': y_1, u'not buy': y_0})
df.plot(kind='bar', stacked=True)
plt.title(u"month to buy")
plt.ylabel(u"counts")
plt.show()
print(y_1/data_train.shape[0])
# may (0.019789) and dec (0.001896) differ by a factor of ten, so this feature matters
 
 
may    0.019789
aug    0.014773
jul    0.014022
apr    0.012916
jun    0.011613
feb    0.009954
nov    0.009045
oct    0.007465
sep    0.006241
mar    0.005727
jan    0.003515
dec    0.001896
Name: month, dtype: float64
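The ratios above divide each month's buy count by all 25,317 rows, so busy months look important partly because they simply have more contacts. The within-month conversion rate separates volume from propensity; a sketch, left commented like the other probes:

# rate_by_month = data_train.groupby('month')['y'].mean().sort_values(ascending=False)
# print(rate_by_month)  # months with few contacts can still convert well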
In [4]:
# Effect of day of month on the target
y_0 = data_train.day[data_train.y == 0].value_counts()
y_1 = data_train.day[data_train.y == 1].value_counts()
df = pd.DataFrame({u'buy': y_1, u'not buy': y_0})
df.plot(kind='bar', stacked=True)
plt.title(u"day to buy")
plt.ylabel(u"counts")
plt.show()
print(y_1/data_train.shape[0])
# The 30th closes the most sales; the 31st closes almost none
 
 
30    0.005964
13    0.005253
15    0.005135
4     0.005016
14    0.004977
12    0.004898
18    0.004898
5     0.004661
20    0.004661
21    0.004621
11    0.004582
8     0.004463
16    0.004345
2     0.004345
3     0.004266
17    0.003950
9     0.003910
6     0.003792
27    0.003792
7     0.003476
22    0.003436
28    0.003160
23    0.002923
25    0.002646
26    0.002528
10    0.002528
29    0.002409
19    0.002370
1     0.001777
24    0.001303
31    0.000869
Name: day, dtype: float64
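The five hard-coded day groups used in the archive above look like rate quantiles of exactly this listing. A sketch of deriving similar bins programmatically rather than typing the lists by hand (pd.qcut is an assumed way of choosing the groups, not necessarily how they were chosen):

# day_rate = data_train.groupby('day')['y'].mean()
# day_bin = pd.qcut(day_rate, 5, labels=[0, 1, 2, 3, 4])  # five rate quantiles
# data_train['day'] = data_train['day'].map(day_bin)
# data_test['day'] = data_test['day'].map(day_bin)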
In [7]:
# 'job','marital','education','default','housing','loan','contact','poutcome':
# these eight fields all get one-hot encoding; for now treat 'unknown' as its
# own category.

dummy = pd.get_dummies(data_train[['day','month','job','marital','education','default','housing','loan','contact','poutcome']])
dummyTest = pd.get_dummies(data_test[['day','month','job','marital','education','default','housing','loan','contact','poutcome']])
data_train = pd.concat([dummy, data_train], axis=1)
data_train.drop(['day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
data_test = pd.concat([dummyTest, data_test], axis=1)
data_test.drop(['day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
print("Preprocessing step 1: one-hot encode the categorical fields")
# default, housing and loan are binary, so one of each dummy pair can be dropped
#data_train.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)
#data_test.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)
data_train['pdays'].replace(-1,999,inplace=True)
data_test['pdays'].replace(-1,999,inplace=True)
print("Preprocessing step 2: replace pdays == -1 with 999")
 
Preprocessing step 1: one-hot encode the categorical fields
Preprocessing step 2: replace pdays == -1 with 999
In [20]:
data_train.head()
Out[20]:
   job_admin.  job_blue-collar  job_entrepreneur  job_housemaid  job_management  job_retired  job_self-employed  job_services  job_student  job_technician  ...  poutcome_other  poutcome_success  poutcome_unknown  age  balance  duration  campaign  pdays  previous  y
0           0                0                 0              0               1            0                  0             0            0               0  ...               0                 0                 1   43      291       150         2     -1         0  0
1           0                0                 0              0               0            0                  0             0            0               1  ...               1                 0                 0   42     5076        99         1    251         2  0
2           1                0                 0              0               0            0                  0             0            0               0  ...               0                 0                 1   47      104        77         2     -1         0  0
3           0                0                 0              0               1            0                  0             0            0               0  ...               0                 0                 1   28     -994       174         2     -1         0  0
4           0                0                 0              0               0            0                  0             0            0               1  ...               0                 0                 1   42     2974       187         5     -1         0  0

5 rows × 39 columns

In [6]:
# Probe the relationship between a single feature and the target
#print('no default:', data_train[data_train['default_yes']==0].count())
#print('default:', data_train[data_train['default_yes']==1].count())
print(data_train['default_yes'].value_counts())
print(data_test['default_yes'].value_counts())
#data_train.groupby(["default_yes"], as_index=False)['y'].count()
 
0    24869
1      448
Name: default_yes, dtype: int64
0    24869
1      448
Name: default_yes, dtype: int64
Out[6]:
(The cell also displayed data_train.corr(), a 36 × 36 correlation matrix,
abridged here. The correlations with the target y were, most positive:
duration 0.394746, poutcome_success 0.305806, contact_cellular 0.134791,
pdays 0.107565, previous 0.088337, job_retired 0.083868, job_student 0.069058,
education_tertiary 0.066901; most negative: poutcome_unknown -0.170697,
contact_unknown -0.153572, housing_yes -0.143589, campaign -0.075173,
job_blue-collar -0.075065. Also notable: pdays correlates -0.868084 with
poutcome_unknown and 0.704495 with poutcome_failure, so the pdays sentinel is
nearly an alias for "no previous campaign".)

36 rows × 36 columns

In [8]:
# Default record vs. buying the term deposit
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure alpha
y_0 = data_atrain.default_yes[data_train.y == 0].value_counts()  # 'data_atrain' typo triggers the NameError below
y_1 = data_train.default_yes[data_train.y == 1].value_counts()
df = pd.DataFrame({u'buy': y_1, u'not buy': y_0})
df.plot(kind='bar', stacked=True)
plt.title(u"buy or not")
plt.xlabel(u"default")
plt.ylabel(u"counts")
plt.show()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-a047910fcfb8> in <module>
      2 fig = plt.figure()
      3 fig.set(alpha=0.2) # set the figure alpha
----> 4 y_0 = data_atrain.default_yes[data_train.y == 0].value_counts()
      5 y_1 = data_train.default_yes[data_train.y == 1].value_counts()
      6 df=pd.DataFrame({u'buy':y_1, u'not buy':y_0})

NameError: name 'data_atrain' is not defined

<Figure size 432x288 with 0 Axes>
In [9]:
# Housing loan vs. buying
# Customers without a housing loan buy at a slightly higher rate, though not by
# much; perhaps mortgage payers are under more financial pressure
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure alpha
y_0 = data_train.housing_yes[data_train.y == 0].value_counts()
y_1 = data_train.housing_yes[data_train.y == 1].value_counts()
df = pd.DataFrame({u'buy': y_1, u'not buy': y_0})
df.plot(kind='bar', stacked=True)
plt.title(u"buy or not")
plt.xlabel(u"housing")
plt.ylabel(u"counts")
plt.show()
# Customers without a default record also buy at a slightly higher rate
 
<Figure size 432x288 with 0 Axes>
 
In [19]:
# Personal loan vs. buying
# The bar chart shows little difference between the two groups
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure alpha
y_0 = data_train.loan_yes[data_train.y == 0].value_counts()
y_1 = data_train.loan_yes[data_train.y == 1].value_counts()
df = pd.DataFrame({u'buy': y_1, u'not buy': y_0})
df.plot(kind='bar', stacked=True)
plt.title(u"buy or not")
plt.xlabel(u"loan")
plt.ylabel(u"counts")
plt.show()
data_train[["loan_yes", "y"]].groupby(['loan_yes'], as_index=False).mean().sort_values(by='y', ascending=False)
# 12.6% of customers without a personal loan bought, versus only 6.89% of those
# with one, so no personal loan means a better chance of a sale

<Figure size 432x288 with 0 Axes>

Out[19]:
   loan_yes         y
0         0  0.126117
1         1  0.068983
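The same groupby-mean probe works for every binary flag at once; a small sketch (commented like the other probes) that prints the buy rate per dummy column:

# prefixes = ('job_', 'marital_', 'education_', 'default_', 'housing_', 'loan_', 'contact_', 'poutcome_')
# for col in [c for c in data_train.columns if c.startswith(prefixes)]:
#     print(col, data_train.groupby(col)['y'].mean().to_dict())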
In [7]:
# Histogram: which age bands buy or don't buy the most
g = sns.FacetGrid(data_train, col='y')
g.map(plt.hist, 'age', bins=20)
plt.show()
# Nothing striking: buyers' ages are spread out, while non-buyers cluster
# between 30 and 40
 
In [8]:
# Histogram of pdays (time since the last contact) for buyers vs. non-buyers
# The shorter the gap, the higher the buy rate, so pdays is quite an important field
g = sns.FacetGrid(data_train, col='y')
g.map(plt.hist, 'pdays', bins=20)
plt.show()
# pdays is still hard to read; revisit it later
 
In [9]:
y = data_train['y']
X = data_train[data_train.columns[: -1]]
X.info()
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25317 entries, 0 to 25316
Data columns (total 51 columns):
month_apr              25317 non-null uint8
month_aug              25317 non-null uint8
month_dec              25317 non-null uint8
month_feb              25317 non-null uint8
month_jan              25317 non-null uint8
month_jul              25317 non-null uint8
month_jun              25317 non-null uint8
month_mar              25317 non-null uint8
month_may              25317 non-null uint8
month_nov              25317 non-null uint8
month_oct              25317 non-null uint8
month_sep              25317 non-null uint8
job_admin.             25317 non-null uint8
job_blue-collar        25317 non-null uint8
job_entrepreneur       25317 non-null uint8
job_housemaid          25317 non-null uint8
job_management         25317 non-null uint8
job_retired            25317 non-null uint8
job_self-employed      25317 non-null uint8
job_services           25317 non-null uint8
job_student            25317 non-null uint8
job_technician         25317 non-null uint8
job_unemployed         25317 non-null uint8
job_unknown            25317 non-null uint8
marital_divorced       25317 non-null uint8
marital_married        25317 non-null uint8
marital_single         25317 non-null uint8
education_primary      25317 non-null uint8
education_secondary    25317 non-null uint8
education_tertiary     25317 non-null uint8
education_unknown      25317 non-null uint8
default_no             25317 non-null uint8
default_yes            25317 non-null uint8
housing_no             25317 non-null uint8
housing_yes            25317 non-null uint8
loan_no                25317 non-null uint8
loan_yes               25317 non-null uint8
contact_cellular       25317 non-null uint8
contact_telephone      25317 non-null uint8
contact_unknown        25317 non-null uint8
poutcome_failure       25317 non-null uint8
poutcome_other         25317 non-null uint8
poutcome_success       25317 non-null uint8
poutcome_unknown       25317 non-null uint8
ID                     25317 non-null int64
age                    25317 non-null int64
balance                25317 non-null int64
duration               25317 non-null int64
campaign               25317 non-null int64
pdays                  25317 non-null int64
previous               25317 non-null int64
dtypes: int64(7), uint8(44)
memory usage: 2.4 MB
In [ ]:
# Inspect the correlation matrix, with y included as a column
#data_train.corr()
# Correlation heatmap
#colormap = plt.cm.RdBu
#plt.figure(figsize=(39,37))
#plt.title('Correlation of Features', y=1.05, size=37)
#sns.heatmap(data_train.astype(float).corr(), linewidths=0.1, vmax=1.0,
#            square=True, cmap=colormap, linecolor='white', annot=True)
#plt.show()
In [11]:
print("Preprocessing step 3: scale the numeric fields")
scaler = StandardScaler()
X = scaler.fit_transform(X)
data_test = scaler.fit_transform(data_test)  # NB: fitting on the test set leaks its statistics; the archive above uses transform instead
# Split off a holdout set for local evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=90)
 
Preprocessing step 3: scale the numeric fields
In [12]:
# print('Decision tree')
# clf = DecisionTreeClassifier(random_state=11)
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
#
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
#
# print('Random forest')
# clf = RandomForestClassifier(n_estimators=10, random_state=11)
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

# print('Logistic regression')
# clf = LogisticRegression()
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

print('AdaBoost')
adaBoost = AdaBoostClassifier(n_estimators=50, random_state=11)
adaBoost.fit(X_train, y_train)
predictionsByadaBoost = adaBoost.predict(X_test)
print(classification_report(y_test, predictionsByadaBoost))
print(cross_val_score(adaBoost, X_train, y_train, scoring='f1'))
print(cross_val_score(adaBoost, X_test, y_test, scoring='f1'))
print(adaBoost.score(X_test, y_test))
pred = adaBoost.predict_proba(X_test)
dataPred = pd.DataFrame(pred, columns=['pred0', 'pred'])
dataPred.drop('pred0', axis=1, inplace=True)
print(dataPred)
y_predprob = adaBoost.predict_proba(X_test)
y_predprob = y_predprob[:, 1]
predictions_train = adaBoost.predict(X_train)
y_predprob_train = adaBoost.predict_proba(X_train)
y_predprob_train = y_predprob_train[:, 1]
print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictionsByadaBoost))
print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
print("Accuracy y_train : %.4g" % metrics.accuracy_score(y_train, predictions_train))
print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_predprob_train))
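Since AUC is the score tracked throughout this archive, comparing the candidates with cross-validated roc_auc rather than a single-split f1 would make the model choice above more trustworthy. A sketch with the same estimators as in the cell, left commented:

# for name, est in [('tree', DecisionTreeClassifier(random_state=11)),
#                   ('forest', RandomForestClassifier(n_estimators=10, random_state=11)),
#                   ('logreg', LogisticRegression()),
#                   ('adaboost', AdaBoostClassifier(n_estimators=50, random_state=11))]:
#     scores = cross_val_score(est, X_train, y_train, scoring='roc_auc', cv=5)
#     print(name, scores.mean())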