import pandas as pd
1. 导入数据
In [75]:
trad_flow = pd.read_csv('D:\python\Script\RFM_TRAD_FLOW.csv',encoding='gbk') #编码格式需要进行修改
trad_flow.head() #默认前五行
Out[75]:
transID | cumid | time | amount | type_label | type | |
---|---|---|---|---|---|---|
0 | 9407 | 10001 | 14JUN09:17:58:34 | 199.0 | 正常 | Normal |
1 | 9625 | 10001 | 16JUN09:15:09:13 | 369.0 | 正常 | Normal |
2 | 11837 | 10001 | 01JUL09:14:50:36 | 369.0 | 正常 | Normal |
3 | 26629 | 10001 | 14DEC09:18:05:32 | 359.0 | 正常 | Normal |
4 | 30850 | 10001 | 12APR10:13:02:20 | 399.0 | 正常 | Normal |
2.计算 RFM
In [21]:
M = trad_flow.groupby(['cumid','type'])[['amount']].sum()
In [48]:
trains_M = pd.pivot_table(M,index='cumid',columns='type',values='amount')
trains_M.head()
Out[48]:
type | Normal | Presented | Special_offer | returned_goods |
---|---|---|---|---|
cumid | ||||
10001 | 3608.0 | 0.0 | 420.0 | -694.0 |
10002 | 1894.0 | 0.0 | NaN | -242.0 |
10003 | 3503.0 | 0.0 | 156.0 | -224.0 |
10004 | 2979.0 | 0.0 | 373.0 | -40.0 |
10005 | 2368.0 | 0.0 | NaN | -249.0 |
In [47]:
F = trad_flow.groupby(['cumid','type'])[['transID']].count()
F.head()
Out[47]:
transID | ||
---|---|---|
cumid | type | |
10001 | Normal | 15 |
Presented | 8 | |
Special_offer | 2 | |
returned_goods | 2 | |
10002 | Normal | 12 |
In [46]:
R = trad_flow.groupby(['cumid','type'])[['time']].max()
R.head()
Out[46]:
time | ||
---|---|---|
cumid | type | |
10001 | Normal | 21JUL09:09:31:26 |
Presented | 31MAR10:20:29:48 | |
Special_offer | 12OCT09:10:59:13 | |
returned_goods | 10JUL10:20:41:54 | |
10002 | Normal | 29JUL09:19:21:41 |
3.衡量客户对打折商品的偏好
In [53]:
trains_M['Special_offer'] = trains_M['Special_offer'].fillna(0)
trains_M['Special_offer'].head()
Out[53]:
cumid
10001 420.0
10002 0.0
10003 156.0
10004 373.0
10005 0.0
Name: Special_offer, dtype: float64
In [67]:
trains_M['spe_ratio'] = trains_M['Special_offer']/(trains_M['Special_offer']+trains_M['Normal'])
trains_M['spe_ratio'].head()
Out[67]:
cumid
10001 0.104270
10002 0.000000
10003 0.042635
10004 0.111277
10005 0.000000
Name: spe_ratio, dtype: float64
In [68]:
M_rank = trains_M.sort_values('spe_ratio',ascending=False).head()
M_rank.head()
Out[68]:
type | Normal | Presented | Special_offer | returned_goods | spe_ratio |
---|---|---|---|---|---|
cumid | |||||
10151 | 765.0 | 0.0 | 870.0 | NaN | 0.532110 |
40033 | 1206.0 | 0.0 | 761.0 | -848.0 | 0.386884 |
40236 | 1155.0 | 0.0 | 691.0 | -793.0 | 0.374323 |
30225 | 1475.0 | 0.0 | 738.0 | -301.0 | 0.333484 |
20068 | 1631.0 | 0.0 | 731.0 | -239.0 | 0.309483 |
In [74]:
pd.qcut(M_rank['spe_ratio'],4)
Out[74]:
cumid
10151 (0.387, 0.532]
40033 (0.374, 0.387]
40236 (0.333, 0.374]
30225 (0.308, 0.333]
20068 (0.308, 0.333]
Name: spe_ratio, dtype: category
Categories (4, interval[float64]): [(0.308, 0.333] < (0.333, 0.374] < (0.374, 0.387] < (0.387, 0.532]]
相关:
【机器学习杂烩篇】详解 Pandas 透视表(pivot_table)
数据链接:https://pan.baidu.com/s/1Qet7LWD89jVGBQK9QUQnDg