# ### 1. 导入数据
import os
import pandas as pd
trad_flow=pd.read_csv(r'RFM_TRAD_FLOW.csv',encoding='gbk')
trad_flow.head()
transID | cumid | time | amount | type_label | type | |
---|---|---|---|---|---|---|
0 | 9407 | 10001 | 14JUN09:17:58:34 | 199.0 | 正常 | Normal |
1 | 9625 | 10001 | 16JUN09:15:09:13 | 369.0 | 正常 | Normal |
2 | 11837 | 10001 | 01JUL09:14:50:36 | 369.0 | 正常 | Normal |
3 | 26629 | 10001 | 14DEC09:18:05:32 | 359.0 | 正常 | Normal |
4 | 30850 | 10001 | 12APR10:13:02:20 | 399.0 | 正常 | Normal |
# ### 2.通过 RFM方法 建立 模型 R(Recency)购买的时间 F(Frequency)购买的次数 M(Monetary)购买的金额
#2.1 通过计算F反应客户对打折产品的偏好
F=trad_flow.groupby(['cumid','type'])[['transID']].count()
F.head(10)
transID | ||
---|---|---|
cumid | type | |
10001 | Normal | 15 |
Presented | 8 | |
Special_offer | 2 | |
returned_goods | 2 | |
10002 | Normal | 12 |
Presented | 5 | |
returned_goods | 1 | |
10003 | Normal | 15 |
Presented | 8 | |
Special_offer | 1 |
#数据处理,空值填充为0。添加新列(兴趣度).做出交叉表的效果F_trans=pd.pivot_table(F,index='cumid',columns='type',values='transID')
F_trans['Special_offer']= F_trans['Special_offer'].fillna(0)
F_trans["interest"]=F_trans['Special_offer']/(F_trans['Special_offer']+F_trans['Normal'])
F_trans.head()
type | Normal | Presented | Special_offer | returned_goods | interest |
---|---|---|---|---|---|
cumid | |||||
10001 | 15.0 | 8.0 | 2.0 | 2.0 | 0.117647 |
10002 | 12.0 | 5.0 | 0.0 | 1.0 | 0.000000 |
10003 | 15.0 | 8.0 | 1.0 | 1.0 | 0.062500 |
10004 | 15.0 | 12.0 | 2.0 | 1.0 | 0.117647 |
10005 | 8.0 | 5.0 | 0.0 | 1.0 | 0.000000 |
#help(pd.pivot_table) ,不会可以用help查看pivot_table的用法以及参数
#2.2 通过计算M反应客户的价值信息
M=trad_flow.groupby(['cumid','type']).sum()
M.head()
transID | amount | ||
---|---|---|---|
cumid | type | ||
10001 | Normal | 456941 | 3608.0 |
Presented | 330459 | 0.0 | |
Special_offer | 49531 | 420.0 | |
returned_goods | 79927 | -694.0 | |
10002 | Normal | 315177 | 1894.0 |
M_trans=pd.pivot_table(M,index='cumid',columns='type',values='amount')
M_trans.head()
type | Normal | Presented | Special_offer | returned_goods |
---|---|---|---|---|
cumid | ||||
10001 | 3608.0 | 0.0 | 420.0 | -694.0 |
10002 | 1894.0 | 0.0 | NaN | -242.0 |
10003 | 3503.0 | 0.0 | 156.0 | -224.0 |
10004 | 2979.0 | 0.0 | 373.0 | -40.0 |
10005 | 2368.0 | 0.0 | NaN | -249.0 |
M_trans['Special_offer']= M_trans['Special_offer'].fillna(0) M_trans['returned_goods']= M_trans['returned_goods'].fillna(0) M_trans["value"]=M_trans['Normal']+M_trans['Special_offer']+M_trans['returned_goods'] M_trans.head()
type | Normal | Presented | Special_offer | returned_goods | value |
---|---|---|---|---|---|
cumid | |||||
10001 | 3608.0 | 0.0 | 420.0 | -694.0 | 3334.0 |
10002 | 1894.0 | 0.0 | 0.0 | -242.0 | 1652.0 |
10003 | 3503.0 | 0.0 | 156.0 | -224.0 | 3435.0 |
10004 | 2979.0 | 0.0 | 373.0 | -40.0 | 3312.0 |
10005 | 2368.0 | 0.0 | 0.0 | -249.0 | 2119.0 |
#2.3 通过计算R反应客户是否为沉默客户
from datetime import datetime
import time
#定义函数,用于将时间转化为需要的格式
def to_time(t):
out_t=time.mktime(time.strptime(t, '%d%b%y:%H:%M:%S')) ########此处修改为时间戳方便后面qcut函数分箱
return out_t
a="14JUN09:17:58:34"
print(to_time(a)) #稍微查看一下函数
1244973514.0
trad_flow["time_new"]= trad_flow.time.apply(to_time)
trad_flow.head()
transID | cumid | time | amount | type_label | type | time_new | |
---|---|---|---|---|---|---|---|
0 | 9407 | 10001 | 14JUN09:17:58:34 | 199.0 | 正常 | Normal | 1.244974e+09 |
1 | 9625 | 10001 | 16JUN09:15:09:13 | 369.0 | 正常 | Normal | 1.245136e+09 |
2 | 11837 | 10001 | 01JUL09:14:50:36 | 369.0 | 正常 | Normal | 1.246431e+09 |
3 | 26629 | 10001 | 14DEC09:18:05:32 | 359.0 | 正常 | Normal | 1.260785e+09 |
4 | 30850 | 10001 | 12APR10:13:02:20 | 399.0 | 正常 | Normal | 1.271049e+09 |
R=trad_flow.groupby(['cumid'])[['time_new']].max()
R.head()
time_new | |
---|---|
cumid | |
10001 | 1.284699e+09 |
10002 | 1.278129e+09 |
10003 | 1.282983e+09 |
10004 | 1.283057e+09 |
10005 | 1.282127e+09 |
# ### 3.构建模型,筛选目标客户
from sklearn import preprocessing threshold = pd.qcut(F_trans['interest'], 2, retbins=True)[1][1] threshold 0.08333333333333333
pd.qcut 的用法:基于分位数的离散化函数。根据等级或样本分位数将变量分解为大小相等的桶。例如,10个分位数的1000个值将产生一个分类对象,指示每个数据点的分位数成员资格。 用于分类,分2类
binarizer = preprocessing.Binarizer(threshold=threshold)
binarizer
Binarizer(copy=True, threshold=0.08333333333333333)
preprocessing.Binarizer(threshold=threshold) 用法:根据大于阈值映射为1的阈值将数据(将特征值设置为0或1),而值小于或等于阈值映射为0。默认阈值为0时,只有正值映射为1。二进制化是对文本计数数据的一种常见操作,分析人员可以决定只考虑功能的存在与否,而不是量化的出现次数。它也可以作为考虑布尔随机变量的估计器的预处理步骤(例如,在贝叶斯设置中使用Bernoulli分布建模)。
interest_q = pd.DataFrame(binarizer.transform(F_trans['interest'].values.reshape(-1, 1)))
interest_q.head() #查看建模后的数据,已经分成了2类
0 | |
---|---|
0 | 1.0 |
1 | 0.0 |
2 | 0.0 |
3 | 1.0 |
4 | 0.0 |
interest_q.index=F_trans.index #将上面的数据添加index和column
interest_q.columns=["interest"]
interest_q.head()
interest | |
---|---|
cumid | |
10001 | 1.0 |
10002 | 0.0 |
10003 | 0.0 |
10004 | 1.0 |
10005 | 0.0 |
threshold = pd.qcut(M_trans['value'], 2, retbins=True)[1][1] #剩下的同上,全部分2类
binarizer = preprocessing.Binarizer(threshold=threshold)
value_q = pd.DataFrame(binarizer.transform(M_trans['value'].values.reshape(-1, 1)))
value_q.index=M_trans.index
value_q.columns=["value"]
threshold = pd.qcut(R["time_new"], 2, retbins=True)[1][1]
binarizer = preprocessing.Binarizer(threshold=threshold)
time_new_q = pd.DataFrame(binarizer.transform(R["time_new"].values.reshape(-1, 1)))
time_new_q.index=R.index
time_new_q.columns=["time"]
analysis=pd.concat([interest_q, value_q,time_new_q], axis=1) #将RFM建模形成的3个表连接
analysis = analysis[['interest','value','time']]
analysis.head()
#添加一列,将分类结果换成中文
label = {
(0,0,0):'无兴趣-低价值-沉默',
(1,0,0):'有兴趣-低价值-沉默',
(1,0,1):'有兴趣-低价值-活跃',
(0,0,1):'无兴趣-低价值-活跃',
(0,1,0):'无兴趣-高价值-沉默',
(1,1,0):'有兴趣-高价值-沉默',
(1,1,1):'有兴趣-高价值-活跃',
(0,1,1):'无兴趣-高价值-活跃'
}
analysis['label'] = analysis[['interest','value','time']].apply(lambda x: label[(x[0],x[1],x[2])], axis = 1)
analysis.head()
interest | value | time | label | |
---|---|---|---|---|
cumid | ||||
10001 | 1.0 | 1.0 | 1.0 | 有兴趣-高价值-活跃 |
10002 | 0.0 | 0.0 | 0.0 | 无兴趣-低价值-沉默 |
10003 | 0.0 | 1.0 | 0.0 | 无兴趣-高价值-沉默 |
10004 | 1.0 | 1.0 | 0.0 | 有兴趣-高价值-沉默 |
10005 | 0.0 | 0.0 | 0.0 | 无兴趣-低价值-沉默 |