1 说明
- 依赖库:time、numpy、pandas、sklearn、pyecharts
- 程序输入:sales.xlsx
- 程序输出:RFM得分数据写本地文件sales_rfm_score.xlsx和数据表(sales_rfm_score)
2 导入相关库
import time
import numpy as np
import pandas as pd
import pymysql
from sklearn.ensemble import RandomForestClassifier
2 读取数据
sheet_names = ['2016','2017','2018','2019','会员等级']
sheet_datas = [pd.read_excel('sales.xlsx',sheet_name=i) for i in sheet_names]
print(type(sheet_datas))
3 数据审查
for each_name,each_data in zip(sheet_names,sheet_datas):
print('[data summary for {0:=^50}]'.format(each_name))
print('Overview:','\n',each_data.head(4))
print('DESC:','\n',each_data.describe())
print('NALL records',each_data.isnull().any(axis=1).sum())
print('Datas_types',each_data.dtypes)
4 数据预处理
for ind,each_data in enumerate(sheet_datas[:-1]):
sheet_datas[ind] = each_data.dropna()
sheet_datas[ind] = each_data[each_data['订单金额'] > 1]
sheet_datas[ind]['max_year_date'] = each_data['提交日期'].max()
data_merge = pd.concat(sheet_datas[:-1],axis=0)
data_merge['date_interval'] = data_merge['max_year_date']-data_merge['提交日期']
data_merge['year'] = data_merge['提交日期'].dt.year
data_merge['date_interval'] = data_merge['date_interval'].apply(lambda x: x.days)
data_merge.head()
rfm_gb = data_merge.groupby(['year','会员ID'],as_index=False)
.agg({'date_interval': 'min',
'提交日期': 'count',
'订单金额': 'sum'})
rfm_gb.columns = ['year','会员ID','r','f','m']
rfm_gb.head()
5 确定RFM划分区间
desc_pd = rfm_gb.iloc[:,2:].describe().T
print(desc_pd)
r_bins = [-1,79,255,365]
f_bins = [0,2,5,130]
m_bins = [0,69,1199,206252]
6 计算RFM因子权重
rfm_merge = pd.merge(rfm_gb,sheet_datas[-1],on='会员ID',how='inner')
rfm_merge['会员等级'].min()
clf = RandomForestClassifier()
clf = clf.fit(rfm_merge[['r','f','m']],rfm_merge['会员等级'])
weights = clf.feature_importances_
print('feature importance:',weights)
7 RFM计算过程
rfm_gb['r_score'] = pd.cut(rfm_gb['r'], r_bins, labels=[i for i in range(len(r_bins)-1,0,-1)])
rfm_gb['f_score'] = pd.cut(rfm_gb['f'], f_bins, labels=[i+1 for i in range(len(f_bins)-1)])
rfm_gb['m_score'] = pd.cut(rfm_gb['m'], m_bins, labels=[i+1 for i in range(len(m_bins)-1)])
rfm_gb = rfm_gb.apply(np.int32)
rfm_gb['rfm_score'] = rfm_gb['r_score'] * weights[0] + rfm_gb['f_score'] * weights[1] + rfm_gb[
'm_score'] * weights[2]
rfm_gb = rfm_gb.apply(np.int32)
rfm_gb['rfm_score'] = rfm_gb['r_score'] * weights[0] + rfm_gb['f_score'] * weights[1] + rfm_gb[
'm_score'] * weights[2]
rfm_gb['r_score'] = rfm_gb['r_score'].astype(np.str)
rfm_gb['f_score'] = rfm_gb['f_score'].astype(np.str)
rfm_gb['m_score'] = rfm_gb['m_score'].astype(np.str)
rfm_gb['rfm_group'] = rfm_gb['r_score'].str.cat(rfm_gb['f_score']).str.cat(
rfm_gb['m_score'])
8 保存RFM结果到Excel
rfm_gb.to_excel('sales_rfm_score.xlsx')
9 写数据到数据库
write_db_data = rfm_gb[['会员ID','r_score','f_score','m_score','rfm_score','rfm_group']]
timestamp = time.strftime('%Y-%m-%d', time.localtime(time.time()))
for each_value in write_db_data.values:
insert_sql = "INSERT INTO `%s` VALUES ('%s',%s,%s,%s,%s,'%s','%s')" % \
(table_name, each_value[0], each_value[1], each_value[2], \
each_value[3],each_value[4],each_value[5],
timestamp)
cursor.execute(insert_sql)
con.commit()
cursor.close()
con.close()