「二分类算法」提供银行精准营销解决方案

最新推荐文章于 2022-08-17 12:48:14 发布

qq_32811823

最新推荐文章于 2022-08-17 12:48:14 发布

阅读量1.7k

点赞数 2

分类专栏：数据分析文章标签：数据分析数据挖掘营销

本文链接：https://blog.csdn.net/qq_32811823/article/details/97611866

版权

银行精准营销解决方案

营销活动以电话为基础，通常银行的客服人员需要至少联系客户一次。

数据集中包含客户年龄、职业、婚姻状况、教育水平等信息，通过这些信息建模，预测客户是否会认购该银行的产品。

一.数据和库的导入

import datetime
import pandas as pd
import numpy as np
import os
import seaborn as sns
import re
import matplotlib.pyplot as pl
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, fbeta_score, \
    make_scorer, mean_absolute_error, roc_auc_score, precision_score

# Silence every warning for the rest of the notebook. The original also
# registered an 'always' filter first, but the 'ignore' filter is inserted
# ahead of it and matches everything, so that call had no effect.
warnings.filterwarnings('ignore')
sns.set(style="darkgrid")
pl.rcParams['font.sans-serif'] = ['SimHei']  # font used so Chinese labels render correctly
pl.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly with this font

%matplotlib inline

/opt/conda/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/opt/conda/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)

对数据进行EDA（探索性数据分析）

# Directory that holds the input data, and the training file inside it.
path = "/home/kesci/input/dhyx/"
train_path = path + "train_set.csv"
# Read through train_path so the file location is defined in exactly one place
# (the original repeated the literal path and left train_path unused).
df = pd.read_csv(train_path)
# Notebook display: preview the first two rows.
df.head(2)

	ID	age	job	marital	education	default	balance	housing	loan	contact	day	month	duration	campaign	pdays	previous	poutcome	y
0	1	43	management	married	tertiary	no	291	yes	no	unknown	9	may	150	2	-1	0	unknown	0
1	2	42	technician	divorced	primary	no	5076	yes	no	cellular	7	apr	99	1	251	2	other	0

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25317 entries, 0 to 25316
Data columns (total 18 columns):
ID           25317 non-null int64
age          25317 non-null int64
job          25317 non-null object
marital      25317 non-null object
education    25317 non-null object
default      25317 non-null object
balance      25317 non-null int64
housing      25317 non-null object
loan         25317 non-null object
contact      25317 non-null object
day          25317 non-null int64
month        25317 non-null object
duration     25317 non-null int64
campaign     25317 non-null int64
pdays        25317 non-null int64
previous     25317 non-null int64
poutcome     25317 non-null object
y            25317 non-null int64
dtypes: int64(9), object(9)
memory usage: 3.5+ MB

df.columns

Index(['ID', 'age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'y'],
      dtype='object')

定义特征和标签

# Model inputs: every column except the first (ID) and the last (y).
feature_columns = df.columns[1:-1]
feature = df[feature_columns]
# Target: the final column, y.
result = df["y"]

对数据进行归一化（Min-Max 缩放）和独热编码处理

def f_clean(feature):
    """Min-max scale selected numeric columns and one-hot encode the rest.

    Args:
        feature: DataFrame of raw features. It must contain every column
            named in ``numerical`` below.

    Returns:
        A new DataFrame in which the listed numeric columns have been
        rescaled by ``MinMaxScaler`` and the remaining object columns have
        been expanded by ``pd.get_dummies``. The input frame is left
        untouched.
    """
    numerical = ['age', 'balance', 'day', 'duration', 'pdays', 'previous']
    # NOTE(review): 'campaign' is also an int64 column but is not scaled
    # here — confirm whether that omission is intentional.

    # Work on a copy: the caller passes a column selection of the raw frame,
    # and writing the scaled values into it would mutate the caller's data
    # (and is the chained assignment pandas warns about).
    cleaned = feature.copy()
    cleaned[numerical] = MinMaxScaler().fit_transform(cleaned[numerical])

    return pd.get_dummies(cleaned)

# Build the model-ready feature matrix via f_clean.
features_hot = f_clean(feature)
# Notebook display: (rows, columns) of the encoded matrix.
features_hot.shape

(25317, 51)

# result.replace('no', 0, inplace=True)
# result.replace('yes', 1, inplace=True)

二.训练模型

划分训练集和测试集

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
f_train, f_test, r_train, r_test = train_test_split(
    features_hot,
    result,
    test_size=0.25,
    random_state=40,
)

先用简单的模型观察强特征

# Baseline model: a random forest with default hyper-parameters, fitted on
# the training split and used to predict the held-out split.
clf = RandomForestClassifier()
clf.fit(f_train, r_train)
pred = clf.predict(f_test)

# scikit-learn metrics take (y_true, y_pred). The original passed the
# predictions first, which is harmless for accuracy but exchanges precision
# and recall in the F-beta score and in the classification report.
# The key names are kept for compatibility; note that both values are
# measured on the held-out split (f_test / r_test), not on the training data.
results = {
    'acc_train': accuracy_score(r_test, pred),
    'f_train': fbeta_score(r_test, pred, beta=0.5),
}

print(results)
print('\n', classification_report(r_test, pred))

{'acc_train': 0.893522906793049, 'f_train': 0.3420195439739414}

               precision    recall  f1-score   support

           0       0.97      0.91      0.94      5941
           1       0.31      0.59      0.41       389

   micro avg       0.89      0.89      0.89      6330
   macro avg       0.64      0.75      0.67      6330
weighted avg       0.93      0.89      0.91      6330

def feature_plot(importances, X_train, y_train):
    
    # Display the five most important features
    indices = np.argsort(importances)[::-1]
    columns = X_train.columns.values[indices[:5]]
    values = importances[indices][:5]

    # Creat the plot
    fig = pl.figure(figsize = (9,5))
    pl.title("Normalized Weights for First Five Most Predictive Features", fontsize = 16)
    rects = pl.bar(np.arange(5), values, width = 0.6, align="center", color = '#00A000', \
                label = "Feature Weight")
    
    # make bar chart higher to fit the text label
    axes = pl.gca()
    axes.set_ylim([0, np.max(values) * 1.1])

    # add text label on each bar
    delta = np.max(values) * 0.02
    
    for rect in rects:
        height = rect.get_height()
        pl.text(rect.get_x() + rect.get_width()/2., 
                height + delta, 
                '%.2f' % height,
                ha='center', 
                va='bottom')
    
    # Detect if xlabels are too long
    rotation = 0 
    for i in columns:
        if len(i) > 20: 
            rotation = 10 # If one is longer than 20 than rotate 10 degrees 
            break
    pl.xticks(np.arange(5), columns, rotation = rotation)
    pl.xlim((-0.5, 4.5))
    pl.ylabel("Weight", fontsize = 12)
    pl.xlabel("Feature", fontsize =

最低0.47元/天解锁文章

qq_32811823

关注

2
点赞
踩
16

收藏

觉得还不错? 一键收藏
0
评论
「二分类算法」提供银行精准营销解决方案

银行精准营销解决方案营销活动以电话为基础，一般，银行的客服人员需要联系客户至少一次。数据集中包含有客户年龄，职业婚姻，教育水平等信息，通过这些信息建模，预测客户是否将认购该银行的产品一.数据和库的导入import datetimeimport pandas as pdimport numpy as npimport osimport seaborn as snsimport re...
复制链接

扫一扫