供自己查阅。
1. 导入数据
import pandas as pd
# Load the raw transaction flow (GBK-encoded CSV). Forward slashes keep the
# relative path valid on both Windows and POSIX systems.
trad_flow = pd.read_csv('./data/RFM_TRAD_FLOW.csv', encoding='gbk')
trad_flow.head(10)
transID cumid time amount type_label type 0 9407 10001 14JUN09:17:58:34 199.0 正常 Normal 1 9625 10001 16JUN09:15:09:13 369.0 正常 Normal 2 11837 10001 01JUL09:14:50:36 369.0 正常 Normal 3 26629 10001 14DEC09:18:05:32 359.0 正常 Normal 4 30850 10001 12APR10:13:02:20 399.0 正常 Normal 5 32007 10001 04MAY10:16:45:58 269.0 正常 Normal 6 36637 10001 04JUN10:20:03:06 0.0 赠送 Presented 7 43108 10001 06JUL10:16:56:40 381.0 正常 Normal 8 43877 10001 10JUL10:20:41:54 -399.0 退货 returned_goods 9 46081 10001 23JUL10:16:35:45 0.0 赠送 Presented
完整程序
# F: number of transactions per (customer, transaction type).
F = (
    trad_flow
    .groupby(['cumid', 'type'])[['transID']]
    .count()
)
# Spread the transaction types into columns: one row per customer.
F_trad = pd.pivot_table(F, values='transID', index='cumid', columns='type')
# A customer with no special-offer purchases is NaN after the pivot; use 0.
F_trad['Special_offer'] = F_trad['Special_offer'].fillna(0)
# interest = special-offer purchases / (special-offer + normal purchases).
F_trad['interest'] = (
    F_trad['Special_offer']
    / (F_trad['Special_offer'] + F_trad['Normal'])
)
# M: total amount per (customer, transaction type).
M = trad_flow.groupby(['cumid', 'type'])[['amount']].sum()
# Spread the transaction types into columns: one row per customer.
M_trad = pd.pivot_table(M, index='cumid', columns='type', values='amount')
# A type the customer never used is NaN after the pivot; treat it as 0 so
# it does not turn the sum below into NaN. Each column is filled once.
M_trad['Special_offer'] = M_trad['Special_offer'].fillna(0)
M_trad['returned_goods'] = M_trad['returned_goods'].fillna(0)
# Net value per customer. Returned goods are recorded with negative amounts
# in the data, so adding the column subtracts the refunds.
M_trad["value"] = (
    M_trad['Normal']
    + M_trad['Special_offer']
    + M_trad['returned_goods']
)
from datetime import datetime
import time
def to_time(t):
    """Convert a transaction timestamp string to seconds since the epoch.

    Args:
        t: Timestamp text in ``%d%b%y:%H:%M:%S`` form, e.g.
            ``"14JUN09:17:58:34"``.

    Returns:
        float: Epoch seconds. ``time.mktime`` interprets the parsed fields
        as local time, so the absolute value depends on the machine's
        time zone.

    Raises:
        ValueError: If ``t`` does not match the expected format.
    """
    parsed = time.strptime(t, '%d%b%y:%H:%M:%S')
    return time.mktime(parsed)
# Try the conversion on the first row before applying it to the column.
to_time(trad_flow['time'].loc[0])
# Epoch-seconds version of every transaction timestamp.
trad_flow['new_time'] = trad_flow['time'].apply(to_time)
# R: each customer's most recent transaction time.
N = trad_flow.groupby(['cumid'])[['new_time']].max()
N['new_time'] = N['new_time'].fillna(0)
# Binarizer comes from scikit-learn; import it here so this complete program
# runs top to bottom without relying on a later cell.
from sklearn import preprocessing

# For each metric: pd.qcut(..., 2, retbins=True) returns (categories, edges),
# and edges[1] is the boundary between the two equal-sized bins. Binarizer
# then maps values strictly greater than that boundary to 1, the rest to 0.

# Interest in special offers.
threshold = pd.qcut(F_trad['interest'], 2, retbins=True)[1][1]
binarizer = preprocessing.Binarizer(threshold=threshold)
interest_q = pd.DataFrame(
    binarizer.transform(F_trad['interest'].values.reshape(-1, 1)),
    index=F_trad.index,
    columns=["interest"],
)

# Net purchase value.
threshold = pd.qcut(M_trad['value'], 2, retbins=True)[1][1]
binarizer = preprocessing.Binarizer(threshold=threshold)
value_q = pd.DataFrame(
    binarizer.transform(M_trad['value'].values.reshape(-1, 1)),
    index=M_trad.index,
    columns=['value'],
)

# Recency of the latest purchase.
threshold = pd.qcut(N["new_time"], 2, retbins=True)[1][1]
binarizer = preprocessing.Binarizer(threshold=threshold)
time_new_q = pd.DataFrame(
    binarizer.transform(N["new_time"].values.reshape(-1, 1)),
    index=N.index,
    columns=["time"],
)
# Put the three 0/1 flags side by side (aligned on the customer index) and
# fix the column order in the same expression.
analysis = pd.concat(
    [interest_q, value_q, time_new_q],
    axis='columns',
)[['interest', 'value', 'time']]
# Segment names keyed by the (interest, value, time) flags, listed in
# binary counting order.
label = {
    (0, 0, 0): '无兴趣-低价值-沉默',
    (0, 0, 1): '无兴趣-低价值-活跃',
    (0, 1, 0): '无兴趣-高价值-沉默',
    (0, 1, 1): '无兴趣-高价值-活跃',
    (1, 0, 0): '有兴趣-低价值-沉默',
    (1, 0, 1): '有兴趣-低价值-活跃',
    (1, 1, 0): '有兴趣-高价值-沉默',
    (1, 1, 1): '有兴趣-高价值-活跃',
}
# Map each customer's three flags to a segment name. The flags are read by
# column name: integer keys such as row[0] on a label-indexed row depend on
# a positional fallback that pandas has deprecated. Selecting the three flag
# columns first also keeps the lookup independent of any other columns.
# The flags are floats (1.0 / 0.0); they hash and compare equal to the
# integer keys of `label`.
analysis['label'] = analysis[['interest', 'value', 'time']].apply(
    lambda row: label[(row['interest'], row['value'], row['time'])],
    axis=1,
)
analysis.head()
interest value time label cumid 10001 1.0 1.0 1.0 有兴趣-高价值-活跃 10002 0.0 0.0 0.0 无兴趣-低价值-沉默 10003 0.0 1.0 0.0 无兴趣-高价值-沉默 10004 1.0 1.0 0.0 有兴趣-高价值-沉默 10005 0.0 0.0 0.0 无兴趣-低价值-沉默
# Scratch check: the same tuple lookup the apply() call performs, tried on a
# plain tuple instead of a DataFrame row.
a = lambda x: label[ ( x[ 0 ] , x[ 1 ] , x[ 2 ] ) ]
a( ( 1 , 1 , 1 ) )
'有兴趣-高价值-活跃'
# NOTE(review): every `threshold = pd.qcut(...)[1][1]` assignment in this file
# leaves `threshold` as a single bin edge, so indexing it again presumably
# dates from a session where `threshold` still held the full
# qcut(..., retbins=True) result — confirm before re-running.
threshold[ 1 ] [ 1 ]
2779.0
# The interest ratios reshaped into a single-column 2-D array, the form
# passed to Binarizer.transform.
F_trad['interest'].values.reshape(-1, 1)
array([[0.11764706],
[0. ],
[0.0625 ],
...,
[0.28571429],
[0.05882353],
[0.09090909]])
分步理解:
2.通过 RFM方法 建立模型
2.1 通过计算F反映客户对打折产品的偏好
# Number of transactions per (customer, transaction type).
F = (
    trad_flow
    .groupby(['cumid', 'type'])[['transID']]
    .count()
)
F.head()
transID cumid type 10001 Normal 15 Presented 8 Special_offer 2 returned_goods 2 10002 Normal 12
# Spread the transaction types into columns: one row per customer. Types a
# customer never used show up as NaN.
F_trans = pd.pivot_table(
    F,
    values='transID',
    index='cumid',
    columns='type',
)
F_trans.head()
type Normal Presented Special_offer returned_goods cumid 10001 15.0 8.0 2.0 2.0 10002 12.0 5.0 NaN 1.0 10003 15.0 8.0 1.0 1.0 10004 15.0 12.0 2.0 1.0 10005 8.0 5.0 NaN 1.0
# No special-offer purchases means a count of 0, not a missing value.
special_offer_filled = F_trans['Special_offer'].fillna(0)
F_trans['Special_offer'] = special_offer_filled
F_trans.head()
type Normal Presented Special_offer returned_goods cumid 10001 15.0 8.0 2.0 2.0 10002 12.0 5.0 0.0 1.0 10003 15.0 8.0 1.0 1.0 10004 15.0 12.0 2.0 1.0 10005 8.0 5.0 0.0 1.0
# interest = special-offer purchases / (special-offer + normal purchases).
F_trans["interest"] = (
    F_trans['Special_offer']
    / (F_trans['Special_offer'] + F_trans['Normal'])
)
F_trans.head()
type Normal Presented Special_offer returned_goods interest cumid 10001 15.0 8.0 2.0 2.0 0.117647 10002 12.0 5.0 0.0 1.0 0.000000 10003 15.0 8.0 1.0 1.0 0.062500 10004 15.0 12.0 2.0 1.0 0.117647 10005 8.0 5.0 0.0 1.0 0.000000
2.2 通过计算M反映客户的价值信息
# Total amount per (customer, transaction type).
M = (
    trad_flow
    .groupby(['cumid', 'type'])[['amount']]
    .sum()
)
M.head()
amount cumid type 10001 Normal 3608.0 Presented 0.0 Special_offer 420.0 returned_goods -694.0 10002 Normal 1894.0
# One row per customer, one amount column per transaction type.
M_trans = pd.pivot_table(
    M,
    values='amount',
    index='cumid',
    columns='type',
)
# A type the customer never used is NaN after the pivot; count it as 0.
M_trans['Special_offer'] = M_trans['Special_offer'].fillna(0)
M_trans['returned_goods'] = M_trans['returned_goods'].fillna(0)
# Net value: returned goods carry negative amounts, so adding them
# subtracts the refunds.
M_trans["value"] = (
    M_trans['Normal']
    + M_trans['Special_offer']
    + M_trans['returned_goods']
)
M_trans.head()
type Normal Presented Special_offer returned_goods value cumid 10001 3608.0 0.0 420.0 -694.0 3334.0 10002 1894.0 0.0 0.0 -242.0 1652.0 10003 3503.0 0.0 156.0 -224.0 3435.0 10004 2979.0 0.0 373.0 -40.0 3312.0 10005 2368.0 0.0 0.0 -249.0 2119.0
2.3 通过计算R反映客户是否为沉默客户
from datetime import datetime
import time
def to_time(t):
    """Parse a ``%d%b%y:%H:%M:%S`` timestamp and return epoch seconds.

    Args:
        t: Timestamp text such as ``"14JUN09:17:58:34"``.

    Returns:
        float: Seconds since the epoch. The fields are interpreted as local
        time by ``time.mktime``, so the absolute value varies with the
        machine's time zone.

    Raises:
        ValueError: If ``t`` does not match the expected format.
    """
    fields = time.strptime(t, '%d%b%y:%H:%M:%S')
    return time.mktime(fields)
# Try the converter on one sample timestamp.
a = '14JUN09:17:58:34'
sample_seconds = to_time(a)
print(sample_seconds)
1244973514.0
# Epoch-seconds version of every transaction timestamp.
trad_flow["time_new"] = trad_flow['time'].apply(to_time)
trad_flow.head()
transID cumid time amount type_label type new_time time_new 0 9407 10001 14JUN09:17:58:34 199.0 正常 Normal 1.244974e+09 1.244974e+09 1 9625 10001 16JUN09:15:09:13 369.0 正常 Normal 1.245136e+09 1.245136e+09 2 11837 10001 01JUL09:14:50:36 369.0 正常 Normal 1.246431e+09 1.246431e+09 3 26629 10001 14DEC09:18:05:32 359.0 正常 Normal 1.260785e+09 1.260785e+09 4 30850 10001 12APR10:13:02:20 399.0 正常 Normal 1.271049e+09 1.271049e+09
# R: each customer's most recent transaction time (epoch seconds).
R = (
    trad_flow
    .groupby(['cumid'])[['time_new']]
    .max()
)
R.head()
time_new cumid 10001 1.284699e+09 10002 1.278129e+09 10003 1.282983e+09 10004 1.283057e+09 10005 1.282127e+09
3.构建模型,筛选目标客户
Pandas —— qcut( )与cut( )的区别
from sklearn import preprocessing
# qcut with 2 bins and retbins=True returns (categories, edges); edges[1] is
# the boundary between the two equal-sized halves.
threshold = pd.qcut(F_trans['interest'], q=2, retbins=True)[1][1]
信息二分
# Binarizer maps values strictly greater than the threshold to 1, the rest
# to 0. It works on 2-D input, hence the reshape to a single column.
binarizer = preprocessing.Binarizer(threshold=threshold)
interest_q = pd.DataFrame(
    binarizer.transform(F_trans['interest'].values.reshape(-1, 1)),
    index=F_trans.index,
    columns=["interest"],
)
# Split customers into low / high value at the boundary between the two
# equal-sized quantile bins of `value`.
threshold = pd.qcut(M_trans['value'], q=2, retbins=True)[1][1]
binarizer = preprocessing.Binarizer(threshold=threshold)
value_q = pd.DataFrame(
    binarizer.transform(M_trans['value'].values.reshape(-1, 1)),
    index=M_trans.index,
    columns=["value"],
)
# Split customers into silent / active at the boundary between the two
# equal-sized quantile bins of the latest purchase time.
threshold = pd.qcut(R["time_new"], q=2, retbins=True)[1][1]
binarizer = preprocessing.Binarizer(threshold=threshold)
time_new_q = pd.DataFrame(
    binarizer.transform(R["time_new"].values.reshape(-1, 1)),
    index=R.index,
    columns=["time"],
)
# Put the three 0/1 flags side by side (aligned on the customer index) and
# fix the column order in the same expression.
analysis = pd.concat(
    [interest_q, value_q, time_new_q],
    axis='columns',
)[['interest', 'value', 'time']]
analysis.head()
# Segment names keyed by the (interest, value, time) flags, listed in
# binary counting order.
label = {
    (0, 0, 0): '无兴趣-低价值-沉默',
    (0, 0, 1): '无兴趣-低价值-活跃',
    (0, 1, 0): '无兴趣-高价值-沉默',
    (0, 1, 1): '无兴趣-高价值-活跃',
    (1, 0, 0): '有兴趣-低价值-沉默',
    (1, 0, 1): '有兴趣-低价值-活跃',
    (1, 1, 0): '有兴趣-高价值-沉默',
    (1, 1, 1): '有兴趣-高价值-活跃',
}
# Map each customer's three flags to a segment name. The flags are read by
# column name: integer keys such as row[0] on a label-indexed row depend on
# a positional fallback that pandas has deprecated.
# The flags are floats (1.0 / 0.0); they hash and compare equal to the
# integer keys of `label`.
analysis['label'] = analysis[['interest', 'value', 'time']].apply(
    lambda row: label[(row['interest'], row['value'], row['time'])],
    axis=1,
)
analysis.head()
interest value time label cumid 10001 1.0 1.0 1.0 有兴趣-高价值-活跃 10002 0.0 0.0 0.0 无兴趣-低价值-沉默 10003 0.0 1.0 0.0 无兴趣-高价值-沉默 10004 1.0 1.0 0.0 有兴趣-高价值-沉默 10005 0.0 0.0 0.0 无兴趣-低价值-沉默