利用python建立用户RFM模型

用数据分析细分用户:RFM分析
用户数据细分

# 加载必要的库
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
from warnings import filterwarnings
filterwarnings('ignore') 
# Load the transaction data; the path is relative to the current working directory
path='../RFM模型/data.csv'
df=pd.read_csv(path)
# Preview the first rows
df.head()
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
053636585123AWHITE HANGING HEART T-LIGHT HOLDER612/1/2010 8:262.5517850.0United Kingdom
153636571053WHITE METAL LANTERN612/1/2010 8:263.3917850.0United Kingdom
253636584406BCREAM CUPID HEARTS COAT HANGER812/1/2010 8:262.7517850.0United Kingdom
353636584029GKNITTED UNION FLAG HOT WATER BOTTLE612/1/2010 8:263.3917850.0United Kingdom
453636584029ERED WOOLLY HOTTIE WHITE HEART.612/1/2010 8:263.3917850.0United Kingdom

数据清洗

去除重复数据

df=df.drop_duplicates()

处理异常数据

1.快速查看统计信息

df.describe()
QuantityUnitPriceCustomerID
count536641.000000536641.000000401604.000000
mean9.6200294.63265615281.160818
std219.13015697.2331181714.006089
min-80995.000000-11062.06000012346.000000
25%1.0000001.25000013939.000000
50%3.0000002.08000015145.000000
75%10.0000004.13000016784.000000
max80995.00000038970.00000018287.000000
# Count the rows whose UnitPrice is negative
df.loc[df['UnitPrice']<0].UnitPrice.count()
2
# Show UnitPrice and Description for the negative-price rows
df.loc[df['UnitPrice']<0,['UnitPrice','Description']]
UnitPriceDescription
299983-11062.06Adjust bad debt
299984-11062.06Adjust bad debt
# Drop the rows with a negative UnitPrice (rows with UnitPrice == 0 are kept).
# NOTE(review): only UnitPrice is filtered here; rows with a negative Quantity
# remain in df — confirm that this is intended.
df=df[df['UnitPrice']>=0]

2.统计缺失值

# Count the missing values in each column
df.isnull().sum()
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135035
Country             0
dtype: int64
# Share of missing values in each column, as a percentage of the row count
df.isnull().sum()/df.shape[0]*100
InvoiceNo       0.000000
StockCode       0.000000
Description     0.270946
Quantity        0.000000
InvoiceDate     0.000000
UnitPrice       0.000000
CustomerID     25.163098
Country         0.000000
dtype: float64
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 536639 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    536639 non-null  object 
 1   StockCode    536639 non-null  object 
 2   Description  535185 non-null  object 
 3   Quantity     536639 non-null  int64  
 4   InvoiceDate  536639 non-null  object 
 5   UnitPrice    536639 non-null  float64
 6   CustomerID   401604 non-null  float64
 7   Country      536639 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 36.8+ MB
# Keep only the rows that have a CustomerID. The explicit .copy() makes df an
# independent frame, so the column assignment below writes to df itself
# instead of to a slice of the previous frame (the pattern that triggers
# pandas' SettingWithCopyWarning).
df=df[df['CustomerID'].notnull()].copy()
# Convert InvoiceDate from string to datetime
df['InvoiceDate']=pd.to_datetime(df['InvoiceDate'])
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 401604 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    401604 non-null  object        
 1   StockCode    401604 non-null  object        
 2   Description  401604 non-null  object        
 3   Quantity     401604 non-null  int64         
 4   InvoiceDate  401604 non-null  datetime64[ns]
 5   UnitPrice    401604 non-null  float64       
 6   CustomerID   401604 non-null  float64       
 7   Country      401604 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.6+ MB
# Print the latest and earliest InvoiceDate to see the period the data covers
print('最大日期是:',df['InvoiceDate'].max())
print('最小日期是:',df['InvoiceDate'].min())
最大日期是: 2011-12-09 12:50:00
最小日期是: 2010-12-01 08:26:00

RFM模型

import datetime
# Add a Sales column: the amount of each row, Quantity * UnitPrice
df['Sales']=df['Quantity']*df['UnitPrice']

groupby用于分组,agg用于聚合(详见pandas文档中agg的用法)。
下面的sum、count、max都是分组后可以直接使用的内置聚合函数:sum表示求和,count表示计数,max表示求最大值。

# Group the rows by customer ID
df_group=df.groupby('CustomerID')
# Per customer: total Sales, number of non-null Quantity entries (i.e. rows),
# and the most recent InvoiceDate.
# NOTE(review): 'count' counts rows, not distinct InvoiceNo values — confirm
# that this is the intended frequency measure.
df_rfm=df_group.agg({'Sales':'sum','Quantity':'count','InvoiceDate':'max'})

dt.days:直接得到时间差中的整天数(注意:相减的两个时间都带时分秒,不足一天的部分会被舍去)。

# Recency: whole days between a fixed reference date and each customer's
# most recent invoice.
reference_date = pd.to_datetime('2012-01-01')
days_since_last_purchase = (reference_date - df_rfm['InvoiceDate']).dt.days
df_rfm['DateDiff'] = days_since_last_purchase

# The raw timestamp is no longer needed once DateDiff exists.
df_rfm = df_rfm.drop(columns=['InvoiceDate'])
df_rfm.head()
SalesQuantityDateDiff
CustomerID
12346.00.002347
12347.04310.0018224
12348.01797.243197
12349.01757.557340
12350.0334.4017332
# Median of each RFM dimension: recency (DateDiff), frequency (Quantity)
# and monetary value (Sales).
rmd, fmd, mmd = (
    df_rfm[column].median() for column in ('DateDiff', 'Quantity', 'Sales')
)
rmd,fmd,mmd
(72.0, 41.0, 644.0700000000002)

对8类用户进行定义(以中位数为分界):将每个用户最近一次购物距今的时间间隔(DateDiff)、购物频次(Quantity)、购物花费总额(Sales)分别与对应的中位数rmd、fmd、mmd进行比较,将满足不同条件的用户归为不同的类别。

def customer_type(frame, r_threshold=None, f_threshold=None, m_threshold=None):
    """Assign every row of ``frame`` to one of eight RFM segments, in place.

    Columns are read by position: column 0 is the monetary total, column 1
    the purchase frequency and column 2 the recency (days since the last
    purchase). Each value is compared with its threshold; the combination of
    the three outcomes selects the segment label.

    Args:
        frame: DataFrame whose first three columns are monetary, frequency
            and recency, in that order.
        r_threshold: Recency split point; rows with recency <= threshold
            count as recent. Defaults to the module-level ``rmd``.
        f_threshold: Frequency split point; rows with frequency >= threshold
            count as frequent. Defaults to the module-level ``fmd``.
        m_threshold: Monetary split point; rows with monetary >= threshold
            count as high value. Defaults to the module-level ``mmd``.

    Returns:
        None. ``frame`` gains (or has overwritten) a ``classification``
        column. A row that satisfies none of the eight rules, e.g. because
        one of its values is NaN, is labelled ``None``.
    """
    # The module-level medians are only looked up when no explicit threshold
    # is passed, so the function also works without those globals.
    r_cut = rmd if r_threshold is None else r_threshold
    f_cut = fmd if f_threshold is None else f_threshold
    m_cut = mmd if m_threshold is None else m_threshold

    monetary = frame.iloc[:, 0].to_numpy()
    frequency = frame.iloc[:, 1].to_numpy()
    recency = frame.iloc[:, 2].to_numpy()

    # Both sides of each split are computed explicitly (rather than negating
    # one mask) so that NaN values fall on neither side.
    recent, lapsed = recency <= r_cut, recency > r_cut
    frequent, infrequent = frequency >= f_cut, frequency < f_cut
    high_value, low_value = monetary >= m_cut, monetary < m_cut

    # The eight masks are mutually exclusive, so assignment order is irrelevant.
    segments = [
        (recent & frequent & high_value, '重要价值用户'),
        (lapsed & frequent & high_value, '重要唤回用户'),
        (recent & infrequent & high_value, '重要深耕用户'),
        (lapsed & infrequent & high_value, '重要挽留用户'),
        (recent & frequent & low_value, '潜力用户'),
        (lapsed & frequent & low_value, '一般维持用户'),
        (recent & infrequent & low_value, '新用户'),
        (lapsed & infrequent & low_value, '流失用户'),
    ]

    labels = np.full(len(frame), None, dtype=object)
    for mask, label in segments:
        labels[mask] = label
    frame['classification'] = labels
# Label every customer in df_rfm (adds the 'classification' column in place)
customer_type(df_rfm)
# Number of customers in each segment
df_rfm.groupby(by='classification').size()
classification
一般维持用户     184
新用户        524
流失用户      1276
潜力用户       202
重要价值用户    1337
重要唤回用户     480
重要挽留用户     209
重要深耕用户     160
dtype: int64
# Horizontal bar chart: number of customers per segment, largest segment first
fig, ax = plt.subplots(figsize=(12,8))
segment_order = df_rfm['classification'].value_counts().index
sns.countplot(
    data=df_rfm,
    y="classification",
    order=segment_order,
    color='#3c7f99',
    ax=ax,
)
ax.set_frame_on(False)
fig.text(
    x=0.04,
    y=0.90,
    s='                                  不同价值的客户数量                       ',
    fontsize=20,
    weight='bold',
)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.xaxis.grid(which='both', linewidth=0.5, color='#3c7f99')
ax.set_xlabel('')
ax.set_ylabel('')

# Per-segment counts sorted in descending order, so the i-th count belongs to
# the i-th bar drawn above.
segment_sizes = df_rfm.groupby('classification')['classification'].count().values
con = sorted(segment_sizes, reverse=True)

# Write each count just to the right of the end of its bar.
for row, total in enumerate(con):
    ax.text(total+0.1, row, f'{total}', va='center', size=14)
plt.show()

![不同价值的客户数量](output_30_0.png)

  • 1
    点赞
  • 18
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值