利用python建立用户RFM模型

最新推荐文章于 2024-01-09 00:01:50 发布

静听山水

最新推荐文章于 2024-01-09 00:01:50 发布

阅读量1.2k

点赞数 1

分类专栏：数据分析——业务知识文章标签： python

原文链接：https://www.kesci.com/home/project/5ecfad89162df90036decd72

版权

数据分析——业务知识专栏收录该内容

29 篇文章 10 订阅

订阅专栏

用数据分析细分用户：RFM分析
 用户数据细分

# 加载必要的库
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
from warnings import filterwarnings
filterwarnings('ignore')

# 导入数据
path='../RFM模型/data.csv'
df=pd.read_csv(path)

df.head()

	InvoiceNo	StockCode	Description	Quantity	InvoiceDate	UnitPrice	CustomerID	Country
0	536365	85123A	WHITE HANGING HEART T-LIGHT HOLDER	6	12/1/2010 8:26	2.55	17850.0	United Kingdom
1	536365	71053	WHITE METAL LANTERN	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
2	536365	84406B	CREAM CUPID HEARTS COAT HANGER	8	12/1/2010 8:26	2.75	17850.0	United Kingdom
3	536365	84029G	KNITTED UNION FLAG HOT WATER BOTTLE	6	12/1/2010 8:26	3.39	17850.0	United Kingdom
4	536365	84029E	RED WOOLLY HOTTIE WHITE HEART.	6	12/1/2010 8:26	3.39	17850.0	United Kingdom

数据清洗

去除重复数据

df=df.drop_duplicates()

处理异常数据

1.快速查看统计信息

df.describe()

	Quantity	UnitPrice	CustomerID
count	536641.000000	536641.000000	401604.000000
mean	9.620029	4.632656	15281.160818
std	219.130156	97.233118	1714.006089
min	-80995.000000	-11062.060000	12346.000000
25%	1.000000	1.250000	13939.000000
50%	3.000000	2.080000	15145.000000
75%	10.000000	4.130000	16784.000000
max	80995.000000	38970.000000	18287.000000

#统计UnitPrice有多少异常的
df.loc[df['UnitPrice']<0].UnitPrice.count()

# 查看这2行的Description是什么
df.loc[df['UnitPrice']<0,['UnitPrice','Description']]

	UnitPrice	Description
299983	-11062.06	Adjust bad debt
299984	-11062.06	Adjust bad debt

# 删除UnitPrice小于0的的数据
df=df[df['UnitPrice']>=0]

2、统计缺失值

# 统计缺失值
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135035
Country             0
dtype: int64

# 统计缺失值的占比
df.isnull().sum()/df.shape[0]*100

InvoiceNo       0.000000
StockCode       0.000000
Description     0.270946
Quantity        0.000000
InvoiceDate     0.000000
UnitPrice       0.000000
CustomerID     25.163098
Country         0.000000
dtype: float64

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 536639 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    536639 non-null  object 
 1   StockCode    536639 non-null  object 
 2   Description  535185 non-null  object 
 3   Quantity     536639 non-null  int64  
 4   InvoiceDate  536639 non-null  object 
 5   UnitPrice    536639 non-null  float64
 6   CustomerID   401604 non-null  float64
 7   Country      536639 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 36.8+ MB

# 删除CustomerID为空的数据
df=df[~(df.CustomerID.isnull())]

# 把InvoiceDate转换为datetime类型
df['InvoiceDate']=pd.to_datetime(df['InvoiceDate'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 401604 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    401604 non-null  object        
 1   StockCode    401604 non-null  object        
 2   Description  401604 non-null  object        
 3   Quantity     401604 non-null  int64         
 4   InvoiceDate  401604 non-null  datetime64[ns]
 5   UnitPrice    401604 non-null  float64       
 6   CustomerID   401604 non-null  float64       
 7   Country      401604 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.6+ MB

# 查看数据日期区间
print('最大日期是：',df['InvoiceDate'].max())
print('最小日期是：',df['InvoiceDate'].min())

最大日期是： 2011-12-09 12:50:00
最小日期是： 2010-12-01 08:26:00

RFM模型

import datetime

# 添加一列Sales
df['Sales']=df['Quantity']*df['UnitPrice']

groupby是分组，agg是用来聚合。agg用法
下面的sum，count，max都是groupby后数据的内置函数，可以直接用。sum表示求和，count表示计数，max表示求最大值。

# 按用户ID进行分组
df_group=df.groupby('CustomerID')
df_rfm=df_group.agg({'Sales':'sum','Quantity':'count','InvoiceDate':'max'})

dt.days：可以直接得到天数（注意这里减法是带时分秒的）。

df_rfm['DateDiff']=(pd.to_datetime('2012-01-01') - df_rfm['InvoiceDate']).dt.days

df_rfm=df_rfm.drop('InvoiceDate',axis=1)
df_rfm.head()

	Sales	Quantity	DateDiff
CustomerID
12346.0	0.00	2	347
12347.0	4310.00	182	24
12348.0	1797.24	31	97
12349.0	1757.55	73	40
12350.0	334.40	17	332

rmd = df_rfm['DateDiff'].median()
fmd = df_rfm['Quantity'].median()
mmd = df_rfm['Sales'].median()
rmd,fmd,mmd

(72.0, 41.0, 644.0700000000002)

对8类用户进行定义（中位数）：与最近一次购物到现在的时间间隔rmd，最近一段时间内的购物频次fmd，最近一段时间内的购物花费总额mmd分别进行比较，将满足不同条件的用户归为不同的类别。

def customer_type(frame): 
    customer_type = []
    for i in range(len(frame)):
        if frame.iloc[i,2]<=rmd and frame.iloc[i,1]>=fmd and frame.iloc[i,0]>=mmd:
            customer_type.append('重要价值用户')
        elif  frame.iloc[i,2]>rmd and frame.iloc[i,1]>=fmd and frame.iloc[i,0]>=mmd:
            customer_type.append('重要唤回用户')
        elif  frame.iloc[i,2]<=rmd and frame.iloc[i,1]<fmd and frame.iloc[i,0]>=mmd:
            customer_type.append('重要深耕用户')
        elif  frame.iloc[i,2]>rmd and frame.iloc[i,1]<fmd and frame.iloc[i,0]>=mmd:
            customer_type.append('重要挽留用户')
        elif  frame.iloc[i,2]<=rmd and frame.iloc[i,1]>=fmd and frame.iloc[i,0]<mmd:
            customer_type.append('潜力用户')
        elif  frame.iloc[i,2]>rmd and frame.iloc[i,1]>=fmd and frame.iloc[i,0]<mmd:
            customer_type.append('一般维持用户')
        elif  frame.iloc[i,2]<=rmd and frame.iloc[i,1]<fmd and frame.iloc[i,0]<mmd:
            customer_type.append('新用户')
        elif frame.iloc[i,2]>rmd and frame.iloc[i,1]<fmd and frame.iloc[i,0]<mmd:
            customer_type.append('流失用户')
    frame['classification'] = customer_type

customer_type(df_rfm)
df_rfm.groupby(by='classification').size()

classification
一般维持用户     184
新用户        524
流失用户      1276
潜力用户       202
重要价值用户    1337
重要唤回用户     480
重要挽留用户     209
重要深耕用户     160
dtype: int64

# 绘制条形图
fig, ax = plt.subplots(figsize=(12,8))
sns.countplot(y="classification",order=df_rfm['classification'].value_counts().index ,data=df_rfm,color='#3c7f99')
plt.box(False) 
fig.text(x=0.04, y=0.90, s='                                  不同价值的客户数量                       ', 
         fontsize=20, weight='bold')
plt.tick_params(axis='both', which='major', labelsize=14)
ax.xaxis.grid(which='both', linewidth=0.5, color='#3c7f99')
plt.xlabel('')
plt.ylabel('')

con=list(df_rfm.groupby('classification').classification.count().values)
con=sorted(con,reverse=True)

for x,y in enumerate(con):
    plt.text(y+0.1,x,'%s' %y,va='center',size=14)
plt.show()