一、背景介绍
数据集为两个csv文件
user_info_utf.csv 和 order_info_utf.csv,包含数据如下:
user_info_utf一共有101536条用户数据
order_info_utf一共有539414条订单数据
二、分析需求
- 统计不同月份的消费人数
- 统计用户三月份的回购率
- 统计男女用户的消费频次是否有差异
- 统计多次消费的用户,第一次和最后一次消费间隔
- 统计不同年龄段、用户的消费金额是否有差异
- 统计消费排名前TOP20%用户,贡献了多少额度
三、数据处理
利用MySql进行数据处理
- 导入数据
建立数据库和数据表
CREATE DATABASE orderdata;
CREATE TABLE orderinfo(
order_id INT PRIMARY KEY NOT NULL,
user_id INT,
ispaid VARCHAR(10),
price FLOAT,
paidtime VARCHAR(30));
CREATE TABLE userinfo(
user_id INT PRIMARY KEY,
sex VARCHAR(10),
birth date);
导入数据
# set global local_infile = 'ON';
LOAD DATA LOCAL INFILE 'C:/Users/junli/Desktop/user_info_utf.csv'
INTO TABLE userinfo
FIELDS TERMINATED BY ','
LINES TERMINATED BY 'rn';
LOAD DATA LOCAL INFILE 'C:/Users/junli/Desktop/order_info_utf.csv'
INTO TABLE orderinfo
FIELDS TERMINATED by ','
LINES TERMINATED BY 'rn';
检查数据是否完好
对数据做初步的处理,时间格式调整,方便使用时间日期函数
UPDATE orderinfo
SET paidtime = REPLACE(paidtime,'/','-')
WHERE paidtime is NOT NULL;
UPDATE orderinfo
SET paidtime = str_to_date(paidtime,'%Y-%m-%d %H:i%')
WHERE paidtime is NOT NULL;
2. 数据分析
- 统计不同月份的订单数
SELECT DATE_FORMAT(paidTime,'%Y-%m') '月份',COUNT(DISTINCT userid) '订单数'
FROM orderinfo
WHERE isPaid = '已支付'
GROUP BY DATE_FORMAT(paidTime,'%Y-%m')
- 统计用户三月份的回购率
select COUNT(user_id) '总用户数',SUM(IF(ct>1,1,0)) '回购用户数'
FROM(
select user_id, COUNT(user_id) ct
FROM orderinfo
WHERE DATE_FORMAT(paidtime,'%Y-%m') = '2016-03'
GROUP BY user_id
) tmp;
回购率 = 54799/16916 = 31%
- 统计男女用户的平均消费次数是否有差异·
SELECT r.sex '性别',AVG(r.ct) '平均消费次数'
FROM(
SELECT o1.user_id,u1.sex,COUNT(u1.user_id) AS ct
FROM orderinfo AS o1
INNER JOIN userinfo AS u1
ON o1.user_id = u1.user_id AND u1.sex !='' AND o1.isPaid = '已支付'
GROUP BY o1.user_id,u1.sex
)r
GROUP BY r.sex
- 统计多次消费的用户,第一次和最后一次消费间隔
SELECT user_id,MAX(paidTime) 最后一次消费 ,
MIN(paidTime) 第一次消费 ,DATEDIFF(max(paidTime) ,min(paidTime)) 消费间隔
FROM orderinfo
WHERE isPaid = '已支付'
GROUP BY userid
HAVING COUNT(*)>1
ORDER BY user_id
- 统计不同年龄段、用户的消费金额是否有差异
SELECT r.age,COUNT(r.ct) 人数,AVG(ct) 平均消费次数,sum(ct)消费总次数,ROUND(AVG(su),1) 消费总额
FROM( SELECT u.userid,CASE
WHEN YEAR(NOW())-YEAR(birth)>=90 THEN '90-100'
WHEN YEAR(NOW())-YEAR(birth)>=80 THEN '80-90'
WHEN YEAR(NOW())-YEAR(birth)>=70 THEN '70-80'
WHEN YEAR(NOW())-YEAR(birth)>=60 THEN '60-70'
WHEN YEAR(NOW())-YEAR(birth)>=50 THEN '50-60'
WHEN YEAR(NOW())-YEAR(birth)>=40 THEN '40-50'
WHEN YEAR(NOW())-YEAR(birth)>=30 THEN '30-40'
WHEN YEAR(NOW())-YEAR(birth)>=20 THEN '20-30'
WHEN YEAR(NOW())-YEAR(birth)>=10 THEN '10-20'
WHEN YEAR(NOW())-YEAR(birth)>=00 THEN '0-10'
END age,COUNT(o.user_id) ct,SUM(price) su
FROM userinfo u INNER JOIN orderinfo o ON u.user_id = o.user_id
WHERE birth > '1901-01-01' AND isPaid = '已支付'
GROUP BY userid)r
GROUP BY age
ORDER BY age
- 统计消费排名TOP20%用户,贡献了多少额度
#先求出总消费金额
SELECT SUM(total)
FROM(
SELECT userid,ROUND(SUM(price),2) total
FROM orderinfo
WHERE isPaid = '已支付'
GROUP BY userid
ORDER BY total DESC) tmp;
#再求出前20%的消费金额
SELECT SUM(total)
FROM(
SELECT userid,total, @num := @num+1 ranking
FROM(
SELECT userid,ROUND(SUM(price),2) total
FROM orderinfo
WHERE isPaid = '已支付'
GROUP BY userid
ORDER BY total DESC)tmp1,
(SELECT @num:=0)tmp2
)tmp3
WHERE ranking <= @num *0.2
占比 = 272202457.60/ 318503081.98 = 84.5%
可见消费排名前20%的用户的消费金额,占总消费金额的84.5%,这些重点客户需要维持并促进消费
在这里只是利用MySql对数据做了简单的处理与分析,要想形成分析报告,还需要利用别的工具绘制图形,图表结合输出报告。