Lender数据流失率分析

Lender数据流失率分析

读入数据并查看情况:

# coding=utf-8
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt

# Python 2 only: switch the interpreter-wide default string encoding to UTF-8
# (presumably so the Chinese plot labels and report names used later in this
# script do not hit implicit ASCII conversions -- confirm). Neither the
# `reload` builtin nor `sys.setdefaultencoding` exists on Python 3, where
# strings are Unicode already, so the hack is skipped there instead of
# raising NameError.
if sys.version_info[0] == 2:
    reload(sys)  # makes sys.setdefaultencoding reachable again on Python 2
    sys.setdefaultencoding('utf-8')
# Source file, read below as line-delimited JSON (one JSON object per line).
file_name = "file://localhost/home/chenyu/Downloads/Lender/output_test_new.json"

data = pd.read_json(file_name, lines=True)

# Lift pandas' display limits so printed frames show every row and column.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

# churn_label is the column used as the final target. Recode its string
# flags to integers ('Y' -> 0, 'N' -> 1), then cast the column to int.
for _flag, _code in (('Y', 0), ('N', 1)):
    data.loc[data['churn_label'] == _flag, 'churn_label'] = _code

data['churn_label'] = data['churn_label'].astype('int')

数据处理

# Columns removed before any analysis.
# NOTE(review): the list name suggests these columns carry no usable data --
# confirm against the raw export.
no_data_list = [
    'dtt_amount',
    'dtt_count',
    'dtt_hold_amount',
    'dtt_total_int',
    'jdd_hold_amount',
    'last_downgrade_date',
    'refer_count',
    'register_activity',
    'vip_downgrade_date',
    'wjt_hold_amount',
    'ygt_amount',
    'ygt_count',
    'ygt_total_int',
    'ymd_sval',
    'yyy_amount',
    'yyy_count',
    'yyy_hold_amount',
    'yyy_total_int',
    'zht_amount',
    'zht_hold_amount',
]

data = data.drop(labels=no_data_list, axis='columns')

# Pairwise correlation matrix, computed so that features with very low
# correlation can be identified and removed by hand.
# Only numeric/bool columns are passed in: the frame still contains string
# columns, and recent pandas versions raise on them instead of silently
# skipping them inside corr().
corr_data = data.select_dtypes(include=['number', 'bool']).corr()

经过人工筛选后保留的最终特征列表

# Names of the features kept after manual screening.
pre_list = ['lender_origin', 'lender_gender', 'lender_horoscope', 'hold_amt', 'l_invest_loan', 'activity_recv_all',
            'addf_count', 'quit_ttz_amount', 'new_auto_invest', 'holding_time', 'hold_buy', 'customer_level',
            'acc_vip_level', 'ssn_city', 'lender_l_invest_loan',
            'lender_age', 'lender_max_invest_per', 'invest_count', 'total_bonus_point',
            'invest_amt', 'quit_ttz_count', 'register_channel', 'total_act_recv', 'referer_type', 'vip_bonus_point',
            'lender_max_invest_loan', 'hold_count', 'addf_amt', 'max_invest_loan',
            'invest_buy', 'lender_max_invest_amt', 'ssn_province', 'daily_acc_point',
            'last_daily_bonus_point', 'lender_l_addf_amt', 'quit_ttz_part_count']

# Feature frame used by the missing-value fill and one-hot encoding steps
# further down. Those steps read and write `pre_data`, which was never
# defined in this script; an explicit copy keeps their in-place fills from
# touching `data`.
pre_data = data[pre_list].copy()

做图看看来源,年龄,星座统计


# A single 20x20 figure; the panels below are placed on a 2x3 grid.
fig = plt.figure(figsize=(20, 20))
fig.set(alpha=0.5)

# Grid cell (0, 0): bar chart of row counts per lender_origin value.
plt.subplot2grid((2, 3), (0, 0))
origin_counts = data['lender_origin'].value_counts()
origin_counts.plot(kind="bar")
plt.ylabel('来源')
plt.title("来源图")

# Age analysis.
# Too many lender_age values are missing to plot the column as-is, so the
# gaps are filled with random integers drawn from [mean - std, mean + std).

average_age = data.lender_age.mean()
std_age = data.lender_age.std()
missing_age = data['lender_age'].isnull()
count_nan_age = int(missing_age.sum())

if count_nan_age > 0:
    # randint expects integer bounds; truncate the float statistics
    # explicitly rather than relying on an implicit conversion.
    rand_1 = np.random.randint(int(average_age - std_age), int(average_age + std_age),
                               size=count_nan_age)

    # Write through .loc on the frame itself. The previous chained form
    # data["lender_age"][mask] = ... may assign into a temporary object and
    # leave `data` unchanged.
    data.loc[missing_age, 'lender_age'] = rand_1

# Grid cell (0, 1): histogram of the filled ages.
plt.subplot2grid((2, 3), (0, 1))
plt.hist(data.lender_age, bins=20)
plt.title("年龄图")
plt.ylabel('年龄')

# Grid cell (0, 2): bar chart of row counts per lender_horoscope value.
plt.subplot2grid((2, 3), (0, 2))
data.lender_horoscope.value_counts().plot(kind="bar")
plt.title("星座图")
plt.ylabel('星座')

# Grid cell (1, 0): hold_amt plotted against the churn label.
plt.subplot2grid((2, 3), (1, 0))
plt.scatter(data.churn_label, data.hold_amt)
plt.ylabel("持有数量")
# The on/off flag is passed positionally: its keyword name differs between
# matplotlib versions (`b` in older releases, `visible` in newer ones), while
# the positional form is accepted by both.
plt.grid(True, which='major', axis='y')
plt.title("持有数量和丢失率")

# Grid cell (1, 1): invest_count plotted against the churn label.
plt.subplot2grid((2, 3), (1, 1))
plt.scatter(data.churn_label, data.invest_count)
plt.ylabel("投资次数")
plt.grid(True, which='major', axis='y')
plt.title("投资次数和丢失率")

plt.show()

填充部分缺失数据

# Numeric features whose missing entries are filled with random integers
# drawn from [mean - std, mean + std) of the column's observed values.
mean_fill_list = ['activity_recv_all', 'total_act_recv', 'lender_l_addf_amt', 'lender_age']

for x in mean_fill_list:
    average = pre_data[x].mean()
    std = pre_data[x].std()
    missing = pre_data[x].isnull()
    count_nan = int(missing.sum())

    # Nothing to fill for this column.
    if count_nan == 0:
        continue

    # randint expects integer bounds; truncate the float statistics
    # explicitly rather than relying on an implicit conversion.
    rand_1 = np.random.randint(int(average - std), int(average + std),
                               size=count_nan)

    # Write through .loc on the frame itself. The previous chained form
    # pre_data[x][mask] = ... may assign into a temporary object and leave
    # `pre_data` unchanged.
    pre_data.loc[missing, x] = rand_1

对字符串(含中文)类别特征进行 one-hot 编码

# Categorical features to one-hot encode. Each is expanded into indicator
# columns named "<feature>_<value>" and the original column is then removed.
categorical_columns = [
    'new_auto_invest',
    'l_invest_loan',
    'register_channel',
    'lender_origin',
    'lender_max_invest_loan',
    'lender_gender',
    'lender_horoscope',
    'customer_level',
    'referer_type',
    'ssn_city',
    'ssn_province',
]

# One indicator frame per categorical feature, in the order listed above.
dummy_frames = [pd.get_dummies(pre_data[column], prefix=column)
                for column in categorical_columns]

# Feature matrix: the selected features followed by all indicator columns,
# minus the raw categorical columns that the indicators replace.
X = pd.concat([pre_data] + dummy_frames, axis=1)
X.drop(categorical_columns, axis=1, inplace=True)

# Target vector: the integer-coded churn label.
y = data['churn_label']

使用决策树模型进行学习

# Split the data: 25% is held out for testing, with a fixed seed so the
# split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

# Decision-tree classifier from sklearn.tree.
from sklearn.tree import DecisionTreeClassifier

# Initialise the classifier with its default configuration.
dtc = DecisionTreeClassifier()
# Fit the model on the training split.
dtc.fit(X_train, y_train)
# Predict labels for the held-out test features.
y_predict = dtc.predict(X_test)

# classification_report from sklearn.metrics.
from sklearn.metrics import classification_report

# Overall accuracy on the test split.
print(dtc.score(X_test, y_test))
# Per-class precision / recall / F1. The ground truth must be the first
# argument and the predictions the second; with the two swapped, precision
# and recall are reported the wrong way round. target_names follows the
# sorted label order: 0 (recoded from 'Y') then 1 (recoded from 'N').
print(classification_report(y_test, y_predict, target_names=['流失', '没有流失']))
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值