对银行客户复购频率进行三分类预测。根据客户信息(包括基础客户画像信息、产品购买行为信息以及第三方客户画像补充信息)预测客户复购行为:先对客户信息数据进行预处理,通过特征选择和特征工程不断构建新特征以提高模型性能,再通过随机森林建模预测客户复购频率,并根据客户平均价值(低频1、中频3、高频5)在独立样本上检验预测准确性(加权准确性),从而识别黏性客户,分析其需求,向他们推送新产品,进行客户关系管理。
比赛用到的数据都是脱敏的,代码分享在下面:
# loading packages
#载入工具包
import os
import pandas as pd #数据处理和分析工具包
import numpy as np #科学运算包
# plotting packages
%matplotlib inline
#类似Matlab的绘图工具包
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimSun', 'Times New Roman'] # 汉字字体集
plt.rcParams['font.size'] = 10 # 字体大小
plt.rcParams['axes.unicode_minus'] = False
import matplotlib.cm as cm
import matplotlib.colors as clrs
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# load data
# Training features arrive in three files (customer profile, purchase
# behaviour, third-party profile) plus a separate label file; the test
# features mirror the same three-file layout.
x1train = pd.read_csv('X1_train.csv')
x2train = pd.read_csv('X2_train.csv')
x3train = pd.read_csv('X3_train.csv')
ytrain = pd.read_csv('y_train.csv')
xtrain = [x1train, x2train, x3train]
x1test = pd.read_csv('X1_test.csv')
x2test = pd.read_csv('X2_test.csv')
x3test = pd.read_csv('X3_test.csv')
xtest = [x1test, x2test, x3test]


def inspect_frames(frames):
    """Print dtypes, shape and the first five rows of each DataFrame."""
    for df in frames:
        print(df.dtypes)
        print("Size of the dataset (row, col): ", df.shape)
        print("\nFirst 5 rows\n", df.head(n=5))
        print("--------------------------------")


# check the data (the original duplicated this loop for train and test)
inspect_frames(xtrain)
inspect_frames(xtest)
#train data processing
# B1 looks like a date string; keep only the leading 4-digit year as int.
# (assumes every B1 value has at least 4 leading digits — TODO confirm)
x2train['B1'] = [int(v[0:4]) for v in x2train['B1']]
print(x2train.head())

for df in xtrain:
    # BUG FIX: snapshot the column list — the loop drops columns, and
    # iterating df.columns while mutating it silently skips columns.
    for col in list(df.columns):
        if col == '客户编号':
            continue
        if df[col].dtypes == object:
            # Encode categorical columns as integer codes.
            df[col] = pd.factorize(df[col])[0]
        elif np.var(df[col]) <= 1:
            # Drop near-constant numeric columns (little information).
            df.drop([col], axis=1, inplace=True)
    # Drop columns that are entirely missing, then impute the remaining
    # gaps with the column mean (numeric_only guards against non-numeric
    # columns under newer pandas).
    df.dropna(axis=1, how='all', inplace=True)
    df.fillna(df.mean(numeric_only=True), inplace=True)
    print("--------------------------------")
    print(df.head(n=100))

for df in xtrain:
    for col in df.columns:
        print(df[col].isnull().sum())

# One row per customer: average the behaviour features over all of a
# customer's purchase records; 'freq' is the number of records the
# customer has (a proxy for purchase frequency).
mv_x2train = x2train.groupby('客户编号', as_index=False).mean()
print(mv_x2train)
freq = x2train['客户编号'].value_counts()
print(freq)
# BUG FIX: value_counts() is indexed by 客户编号 while mv_x2train has a
# RangeIndex, so plain assignment (the original code) misaligned rows.
# Map by customer id instead.
mv_x2train['freq'] = mv_x2train['客户编号'].map(freq)
print(mv_x2train)
final = pd.merge(ytrain, mv_x2train, how='left')
final.fillna(0, inplace=True)
print(final.head())
print('jjjjjjjjnbbbbbbbzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz')
# Build the per-group training matrices. The dropped columns were chosen
# from the correlation / feature-importance analysis below (low-value or
# highly collinear features). `.drop(..., axis=1)` replaces the positional
# axis argument, which was removed in pandas 2.0.
X1train = x1train.drop(['客户编号', 'A11', 'A12', 'A19'], axis=1)
X2train = final.drop(['客户编号', '复购频率', 'B8', 'B10', 'B13', 'B14', 'B15', 'B16', 'B19'], axis=1)
X3train = x3train.drop(['客户编号', 'C2', 'C5', 'C8', 'C10', 'C13', 'C29', 'C31', 'C33', 'C37', 'C42', 'C48', 'C49'], axis=1)
Xtrain_1 = [X1train, X2train, X3train]
X_train = pd.concat(Xtrain_1, axis=1)
print(X_train.columns)
Xtrain = [X1train, X2train, X3train, X_train]
Ytrain = ytrain.drop('客户编号', axis=1)
# Correlation heatmap per feature group.
# NOTE(review): the name `cm` shadows the matplotlib.cm module imported
# at the top of the file — kept for compatibility with later code.
for i in Xtrain:
    cm = np.corrcoef(i.values.T)
    hm = sns.heatmap(cm, cbar=True, square=False, fmt='.3f', annot=True,
                     annot_kws={'size': 1}, yticklabels=i.columns, xticklabels=i.columns)
    plt.show()
#test data processing
# Mirrors the training pipeline, except no variance-based column dropping
# here: train/test feature sets are aligned later, before the final fit.
x2test['B1'] = [int(v[0:4]) for v in x2test['B1']]
print(x2test.head())

for df in xtest:
    for col in df.columns:
        if col != '客户编号' and df[col].dtypes == object:
            # Encode categorical columns as integer codes.
            df[col] = pd.factorize(df[col])[0]
    # Drop all-missing columns, then mean-impute remaining gaps.
    df.dropna(axis=1, how='all', inplace=True)
    df.fillna(df.mean(numeric_only=True), inplace=True)
    print("--------------------------------")
    print(df.head(n=100))

for df in xtest:
    for col in df.columns:
        print(df[col].isnull().sum())

# Aggregate to one row per customer, as for training.
mv_x2test = x2test.groupby('客户编号', as_index=False).mean()
print(mv_x2test)
freqtest = x2test['客户编号'].value_counts()
# BUG FIX: the original printed the *training* series `freq` here.
print(freqtest)
# BUG FIX: map by customer id — index-aligned assignment misaligned rows
# (value_counts is indexed by 客户编号, mv_x2test by a RangeIndex).
mv_x2test['freq'] = mv_x2test['客户编号'].map(freqtest)
print(mv_x2test)
# Left-merge onto the test customer ids so every test customer keeps a row.
finaltest = pd.merge(x1test[['客户编号']], mv_x2test, how='left')
finaltest.fillna(0, inplace=True)
print(finaltest.head())
print('jjjjjjjjnbbbbbbbzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz')
# Build the test feature matrix (keyword axis=1: positional axis was
# removed in pandas 2.0).
X1test = x1test.drop(['客户编号'], axis=1)
X2test = finaltest.drop(['客户编号'], axis=1)
X3test = x3test.drop(['客户编号'], axis=1)
Xtest_1 = [X1test, X2test, X3test]
X_test = pd.concat(Xtest_1, axis=1)
print(X_test.columns)
Xtest = [X_test]
for i in Xtest:
    cmtest = np.corrcoef(i.values.T)
    # BUG FIX: the original plotted the *training* correlation matrix
    # `cm` here instead of cmtest.
    hmtest = sns.heatmap(cmtest, cbar=True, square=False, fmt='.3f', annot=True,
                         annot_kws={'size': 1}, yticklabels=i.columns, xticklabels=i.columns)
    plt.show()
# Model hyper-parameter tuning helpers.
def para_tune1(para, X, Y):
    """Mean cross-validated accuracy of a random forest with n_estimators=para."""
    forest = RandomForestClassifier(n_estimators=para)
    scores = cross_val_score(forest, X, Y, scoring='accuracy')
    return np.mean(scores)
def accurate_curve1(para_range, X, Y, title):
    """Plot cross-validated accuracy against each candidate n_estimators value.

    Returns the pyplot module so the caller can continue customising the figure.
    """
    score = [para_tune1(para, X, Y) for para in para_range]
    plt.figure()
    plt.title(title)
    plt.xlabel('Parameters')  # typo fix: was 'Paramters'
    plt.ylabel('Score')
    plt.grid()
    plt.plot(para_range, score, 'o-')
    return plt
def para_tune2(para, X, Y):
    """Mean cross-validated accuracy of a 300-tree forest with max_depth=para."""
    clf = RandomForestClassifier(n_estimators=300, max_depth=para)
    # BUG FIX: the original scored against the module-level global `y`
    # instead of the Y argument passed in.
    score = np.mean(cross_val_score(clf, X, Y, scoring='accuracy'))
    return score
def accurate_curve2(para_range, X, Y, title):
    """Plot cross-validated accuracy against each candidate max_depth value.

    Returns the pyplot module so the caller can continue customising the figure.
    """
    score = [para_tune2(para, X, Y) for para in para_range]
    plt.figure()
    plt.title(title)
    plt.xlabel('Parameters')  # typo fix: was 'Paramters'
    plt.ylabel('Score')
    plt.grid()
    plt.plot(para_range, score, 'o-')
    return plt
# Sweep n_estimators and max_depth on every feature set.
for i in Xtrain:
    x = pd.DataFrame(i)
    # NOTE(review): x/y are module-level names; para_tune2 as written
    # reads the global `y`, so this assignment is load-bearing for the
    # accurate_curve2 call below — keep the names.
    y = Ytrain
    g = accurate_curve1([2, 10, 50, 100, 150, 200, 250], x, y, 'n_estimator tuning')
    h = accurate_curve2([2, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100], x, y, 'max_depth tuning')
#pre-model
# For each feature set: hold-out split, fit a quick random forest,
# report accuracy, and plot feature importances.
for i in Xtrain:
    x = pd.DataFrame(i)
    print(x.head(5))
    y = Ytrain
    print(y.head(5))
    print('划分训练集测试集...')
    # NOTE(review): test_size=0.8 keeps only 20% of rows for training —
    # presumably deliberate for this contest setting; confirm.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.8)
    # Random forest. 'sqrt' replaces max_features='auto' (an alias for
    # 'sqrt' on classifiers, removed in scikit-learn 1.3).
    model_rf = RandomForestClassifier(n_estimators=150, criterion='gini',
                                      max_depth=10, max_features='sqrt',
                                      random_state=1)
    # ravel() turns the single-column label frame into the 1-D array
    # sklearn expects (avoids a DataConversionWarning).
    model_rf.fit(x_train, y_train.values.ravel())
    print('随机森林分类...')
    pre = model_rf.predict(x_test)
    # accuracy_score(y_true, y_pred): ground truth goes first.
    print('准确率:', accuracy_score(y_test, pre))
    # Predicted class-probability matrix.
    pre_p = model_rf.predict_proba(x_test)
    print(pre_p)
    print("-----------------------------------------------------------------------")
    # Visualise feature importances, sorted descending.
    feature_importances = model_rf.feature_importances_
    feature_names = list(i.columns)
    feature_importances_df = pd.DataFrame({'feature': feature_names,
                                           'importance': feature_importances})
    feature_importances_df = feature_importances_df.sort_values('importance', ascending=False)
    # One viridis colour per feature.
    colors = plt.cm.viridis(np.linspace(0, 1, len(feature_names)))
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(feature_importances_df['feature'], feature_importances_df['importance'], color=colors)
    ax.invert_yaxis()  # put the most important feature on top
    ax.set_xlabel('特征重要性', fontsize=12)
    ax.set_title('随机森林特征重要性可视化', fontsize=16)
    # Annotate each bar with its importance value.
    for n, p in enumerate(feature_importances_df['importance']):
        ax.text(p + 0.01, n, str(round(p, 3)), va='center',
                fontname='Times New Roman', fontsize=10)
    ax.spines['top'].set_visible(False)    # drop the top frame line
    ax.spines['right'].set_visible(False)  # drop the right frame line
    plt.show()
#model
# Align train/test feature columns: keep every training column that also
# exists in the test frame, and always keep the engineered 'freq' column.
# NOTE(review): the model is fit on *all* X_train columns but predicts on
# X_test[li]; this only works when the two column sets actually match —
# verify after the drop lists above change.
li = [c for c in X_train.columns if c == 'freq' or c in X_test.columns]
x_train = X_train
x_test = X_test[li]
y_train = Ytrain
# Random forest ('sqrt' replaces the max_features='auto' alias removed
# in scikit-learn 1.3; min_samples_leaf=1000 heavily regularises).
model_rf_test = RandomForestClassifier(n_estimators=100, criterion='gini',
                                       max_depth=None, min_samples_leaf=1000,
                                       max_features='sqrt', random_state=1)
model_rf_test.fit(x_train, y_train.values.ravel())
print('随机森林分类...')
y_pre_test = model_rf_test.predict(x_test)
print(y_pre_test)
rebuy = pd.DataFrame(y_pre_test, columns=['复购频率'])
# index=False (was index=0): don't write the row index to the file.
rebuy.to_excel('predict.xlsx', index=False)
# Model evaluation.
class ClassEval():
    """Binary-classification metrics built on a confusion matrix.

    The confusion matrix uses the fixed label order [1, 0]: row = true
    class, column = predicted class (so index 0 corresponds to label 1).
    Metrics are computed with numpy directly from that matrix.
    """

    LABELS = [1, 0]  # label order used by every metric below

    def __init__(self, preds, y_tests):
        self.pre = preds        # predicted labels (array-like of 0/1)
        self.y_test = y_tests   # ground-truth labels (array-like of 0/1)
        self.C2 = None          # raw-count confusion matrix, filled lazily

    def confusion_matrix(self):
        """Compute the confusion matrix; return it as a percentage of samples.

        self.C2 keeps the raw counts for the other metrics.
        """
        # ravel() tolerates (n, 1)-shaped inputs (e.g. a DataFrame split),
        # matching sklearn's column_or_1d behaviour.
        y_true = np.asarray(self.y_test).ravel()
        y_pred = np.asarray(self.pre).ravel()
        # C2[i, j] = samples with true label LABELS[i] predicted as LABELS[j].
        self.C2 = np.array([[np.sum((y_true == t) & (y_pred == p))
                             for p in self.LABELS]
                            for t in self.LABELS])
        return self.C2 / len(self.pre) * 100

    def C2_heatmap(self):
        """Draw the confusion matrix as a heatmap."""
        self.confusion_matrix()
        sns.set()
        f, ax = plt.subplots(figsize=(8, 7))
        TX = sns.heatmap(self.C2, annot=True, ax=ax, cmap="Spectral_r", fmt=".20g")
        ax.set_title("Confusion Matrix")
        ax.set_xlabel("Predict")
        ax.set_ylabel("Answer")
        print("混淆矩阵")

    def get_acc(self):
        """Overall accuracy: correct predictions over all samples."""
        self.confusion_matrix()
        return np.trace(self.C2) / self.C2.sum()

    def get_precision(self):
        """Per-class precision, in LABELS order.

        BUG FIX: precision is TP over the *predicted* totals (column
        sums); the original divided by row sums, which is recall.
        """
        self.confusion_matrix()
        col_totals = self.C2.sum(axis=0)
        return [self.C2[i, i] / col_totals[i] for i in range(len(self.C2))]

    def get_Recall(self):
        """Per-class recall, in LABELS order.

        BUG FIX: recall is TP over the *actual* totals (row sums); the
        original divided by column sums, which is precision.
        """
        self.confusion_matrix()
        row_totals = self.C2.sum(axis=1)
        return [self.C2[i, i] / row_totals[i] for i in range(len(self.C2))]

    def get_F1(self):
        """Per-class F1 score (harmonic mean of precision and recall)."""
        self.confusion_matrix()
        Precision = self.get_precision()
        Recall = self.get_Recall()
        return [2 * p * r / (p + r) for p, r in zip(Precision, Recall)]

    def get_kappa(self):
        """Cohen's kappa.

        BUG FIX: the original referenced attributes that were never set
        (self.predict_label_list / self.answer_label_list) and always
        raised AttributeError. Compute kappa from the confusion matrix:
        (p_observed - p_expected) / (1 - p_expected).
        """
        self.confusion_matrix()
        n = self.C2.sum()
        po = np.trace(self.C2) / n
        pe = np.sum(self.C2.sum(axis=0) * self.C2.sum(axis=1)) / (n * n)
        return (po - pe) / (1 - pe)
THRESHOLD = [.05, .10, .15]  # probability thresholds to sweep
# Binary evaluation: collapse the 3-class label into "high frequency
# (label > 1) vs not", then threshold the probability of class index 2
# (presumably the high-frequency class — confirm against label encoding).
for i in Xtrain:
    x = pd.DataFrame(i)
    print(x.head(5))
    y = Ytrain
    print(y.head(5))
    print('划分训练集测试集...')
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.8)
    y_tests = np.where(y_test > 1, 1, 0).ravel()
    print(len(y_tests))
    # Random forest ('sqrt' replaces the removed max_features='auto' alias).
    model_rf = RandomForestClassifier(n_estimators=100, criterion='gini',
                                      max_depth=None, min_samples_leaf=1000,
                                      max_features='sqrt', random_state=1)
    # Fit once per split — the model does not depend on the threshold,
    # so refitting inside the threshold loop (as before) was wasted work.
    model_rf.fit(x_train, y_train.values.ravel())
    for h in THRESHOLD:
        # if prob > threshold, predict 1
        preds = np.where(model_rf.predict_proba(x_test)[:, 2] > h, 1, 0)
        print(len(preds))
        RF_data = ClassEval(preds, y_tests)
        print("精确度", RF_data.get_acc())
        print("精准率", RF_data.get_precision())
        print("召回率", RF_data.get_Recall())
        print("混淆矩阵", RF_data.confusion_matrix())