用数据分析细分用户:RFM分析 用户数据细分
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
import seaborn as sns
import matplotlib. pyplot as plt
from pylab import *
mpl. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
from warnings import filterwarnings
filterwarnings( 'ignore' )
path= '../RFM模型/data.csv'
df= pd. read_csv( path)
df. head( )
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country 0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850.0 United Kingdom 1 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850.0 United Kingdom 2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850.0 United Kingdom 3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850.0 United Kingdom 4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
数据清洗
去除重复数据
df= df. drop_duplicates( )
处理异常数据
1.快速查看统计信息
df. describe( )
Quantity UnitPrice CustomerID count 536641.000000 536641.000000 401604.000000 mean 9.620029 4.632656 15281.160818 std 219.130156 97.233118 1714.006089 min -80995.000000 -11062.060000 12346.000000 25% 1.000000 1.250000 13939.000000 50% 3.000000 2.080000 15145.000000 75% 10.000000 4.130000 16784.000000 max 80995.000000 38970.000000 18287.000000
df. loc[ df[ 'UnitPrice' ] < 0 ] . UnitPrice. count( )
2
df. loc[ df[ 'UnitPrice' ] < 0 , [ 'UnitPrice' , 'Description' ] ]
UnitPrice Description 299983 -11062.06 Adjust bad debt 299984 -11062.06 Adjust bad debt
df= df[ df[ 'UnitPrice' ] >= 0 ]
2、统计缺失值
df. isnull( ) . sum ( )
InvoiceNo 0
StockCode 0
Description 1454
Quantity 0
InvoiceDate 0
UnitPrice 0
CustomerID 135035
Country 0
dtype: int64
df. isnull( ) . sum ( ) / df. shape[ 0 ] * 100
InvoiceNo 0.000000
StockCode 0.000000
Description 0.270946
Quantity 0.000000
InvoiceDate 0.000000
UnitPrice 0.000000
CustomerID 25.163098
Country 0.000000
dtype: float64
df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 536639 entries, 0 to 541908
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 InvoiceNo 536639 non-null object
1 StockCode 536639 non-null object
2 Description 535185 non-null object
3 Quantity 536639 non-null int64
4 InvoiceDate 536639 non-null object
5 UnitPrice 536639 non-null float64
6 CustomerID 401604 non-null float64
7 Country 536639 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 36.8+ MB
df= df[ ~ ( df. CustomerID. isnull( ) ) ]
df[ 'InvoiceDate' ] = pd. to_datetime( df[ 'InvoiceDate' ] )
df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 401604 entries, 0 to 541908
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 InvoiceNo 401604 non-null object
1 StockCode 401604 non-null object
2 Description 401604 non-null object
3 Quantity 401604 non-null int64
4 InvoiceDate 401604 non-null datetime64[ns]
5 UnitPrice 401604 non-null float64
6 CustomerID 401604 non-null float64
7 Country 401604 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.6+ MB
print ( '最大日期是:' , df[ 'InvoiceDate' ] . max ( ) )
print ( '最小日期是:' , df[ 'InvoiceDate' ] . min ( ) )
最大日期是: 2011-12-09 12:50:00
最小日期是: 2010-12-01 08:26:00
RFM模型
import datetime
df[ 'Sales' ] = df[ 'Quantity' ] * df[ 'UnitPrice' ]
groupby是分组,agg是用来聚合。agg用法 下面的sum,count,max都是groupby后数据的内置函数,可以直接用。sum表示求和,count表示计数,max表示求最大值。
df_group= df. groupby( 'CustomerID' )
df_rfm= df_group. agg( { 'Sales' : 'sum' , 'Quantity' : 'count' , 'InvoiceDate' : 'max' } )
dt.days:可以直接得到天数(注意这里减法是带时分秒的)。
df_rfm[ 'DateDiff' ] = ( pd. to_datetime( '2012-01-01' ) - df_rfm[ 'InvoiceDate' ] ) . dt. days
df_rfm= df_rfm. drop( 'InvoiceDate' , axis= 1 )
df_rfm. head( )
Sales Quantity DateDiff CustomerID 12346.0 0.00 2 347 12347.0 4310.00 182 24 12348.0 1797.24 31 97 12349.0 1757.55 73 40 12350.0 334.40 17 332
rmd = df_rfm[ 'DateDiff' ] . median( )
fmd = df_rfm[ 'Quantity' ] . median( )
mmd = df_rfm[ 'Sales' ] . median( )
rmd, fmd, mmd
(72.0, 41.0, 644.0700000000002)
对8类用户进行定义(中位数):与最近一次购物到现在的时间间隔rmd,最近一段时间内的购物频次fmd,最近一段时间内的购物花费总额mmd分别进行比较,将满足不同条件的用户归为不同的类别。
def customer_type ( frame) :
customer_type = [ ]
for i in range ( len ( frame) ) :
if frame. iloc[ i, 2 ] <= rmd and frame. iloc[ i, 1 ] >= fmd and frame. iloc[ i, 0 ] >= mmd:
customer_type. append( '重要价值用户' )
elif frame. iloc[ i, 2 ] > rmd and frame. iloc[ i, 1 ] >= fmd and frame. iloc[ i, 0 ] >= mmd:
customer_type. append( '重要唤回用户' )
elif frame. iloc[ i, 2 ] <= rmd and frame. iloc[ i, 1 ] < fmd and frame. iloc[ i, 0 ] >= mmd:
customer_type. append( '重要深耕用户' )
elif frame. iloc[ i, 2 ] > rmd and frame. iloc[ i, 1 ] < fmd and frame. iloc[ i, 0 ] >= mmd:
customer_type. append( '重要挽留用户' )
elif frame. iloc[ i, 2 ] <= rmd and frame. iloc[ i, 1 ] >= fmd and frame. iloc[ i, 0 ] < mmd:
customer_type. append( '潜力用户' )
elif frame. iloc[ i, 2 ] > rmd and frame. iloc[ i, 1 ] >= fmd and frame. iloc[ i, 0 ] < mmd:
customer_type. append( '一般维持用户' )
elif frame. iloc[ i, 2 ] <= rmd and frame. iloc[ i, 1 ] < fmd and frame. iloc[ i, 0 ] < mmd:
customer_type. append( '新用户' )
elif frame. iloc[ i, 2 ] > rmd and frame. iloc[ i, 1 ] < fmd and frame. iloc[ i, 0 ] < mmd:
customer_type. append( '流失用户' )
frame[ 'classification' ] = customer_type
customer_type( df_rfm)
df_rfm. groupby( by= 'classification' ) . size( )
classification
一般维持用户 184
新用户 524
流失用户 1276
潜力用户 202
重要价值用户 1337
重要唤回用户 480
重要挽留用户 209
重要深耕用户 160
dtype: int64
fig, ax = plt. subplots( figsize= ( 12 , 8 ) )
sns. countplot( y= "classification" , order= df_rfm[ 'classification' ] . value_counts( ) . index , data= df_rfm, color= '#3c7f99' )
plt. box( False )
fig. text( x= 0.04 , y= 0.90 , s= ' 不同价值的客户数量 ' ,
fontsize= 20 , weight= 'bold' )
plt. tick_params( axis= 'both' , which= 'major' , labelsize= 14 )
ax. xaxis. grid( which= 'both' , linewidth= 0.5 , color= '#3c7f99' )
plt. xlabel( '' )
plt. ylabel( '' )
con= list ( df_rfm. groupby( 'classification' ) . classification. count( ) . values)
con= sorted ( con, reverse= True )
for x, y in enumerate ( con) :
plt. text( y+ 0.1 , x, '%s' % y, va= 'center' , size= 14 )
plt. show( )