1.导入库
import time
import numpy as np
import pandas as pd
import pymysql
from sklearn.ensemble import RandomForestClassifier
from pyecharts.charts import Bar3D
from pyecharts import options as opts
2.读取数据
sheet_names = ['2015','2016','2017','2018','会员等级']
sheet_datas = [pd.read_excel('sales.xlsx',sheet_name=i) for i in sheet_names]
3.数据审查
for each_name,each_data in zip(sheet_names,sheet_datas):
print('[data summary for {0:=^50}]'.format(each_name))
print('Overview:','\n',each_data.head(4))
print('DESC:','\n',each_data.describe())
print('NA records',each_data.isnull().any(axis=1).sum())
print('Dtypes',each_data.dtypes)
查看前4条数据,查看描述性统计结果,判断记录缺失值,查看字段数据类型
4.数据预处理
1)去除缺失值和异常值:
for ind,each_data in enumerate(sheet_datas[:-1]):
sheet_datas[ind] = each_data.dropna()
sheet_datas[ind] = each_data[each_data['订单金额'] > 1]
sheet_datas[ind]['max_year_date'] = each_data['提交日期'].max()
去除缺失值、无意义数据,增加一列记录日期最大值,方便后续进行计算分析
2)汇总数据:
data_merge = pd.concat(sheet_datas[:-1],axis=0)
上下合并
3)获取各自年份数据并#转换日期间隔为数字
data_merge['date_interval'] = data_merge['max_year_date']-data_merge['提交日期']
data_merge['year'] = data_merge['提交日期'].dt.year
data_merge['date_interval'] = data_merge['date_interval'].apply(lambda x: x.days)
4)按会员ID做汇总并进行重命名
rfm_gb = data_merge.groupby(['year','会员ID'],as_index=False).agg({'date_interval': 'min',
'提交日期': 'count',
'订单金额': 'sum'})
rfm_gb.columns = ['year','会员ID','r','f','m']
rfm_gb.head()