【Hive】（十四）Hive 项目实战之电子商务消费行为分析_hive项目实战(1)

2301_79772893

于 2024-04-30 07:43:14 发布

阅读量291

点赞数 5

分类专栏：程序员文章标签： hive hadoop 数据仓库

本文链接：https://blog.csdn.net/2301_79772893/article/details/138333182

版权

程序员专栏收录该内容

184 篇文章 2 订阅

订阅专栏

Clear data and import data into transaction_details

-- partday 分区 transaction\_id 重复 
select if(t.ct=1,transaction_id,concat(t.transaction_id,'\_',t.ct-1)) 
transaction_id,customer_id,store_id,price,product,buydate,buytime,date_format(buydate,'yyyy-MM') 
as partday 
from (select \*,row_number() over(partition by transaction_id) as ct 
from ext_transaction_details) t

insert into transaction_details partition(partday) 
select if(t.ct=1,transaction_id,concat(t.transaction_id,'\_',t.ct-1)) transaction_id,customer_id,store_id,price,product,buydate,buytime,date_format(regexp_replace(buydate,'/','-'),'yyyy-MM') 
as partday from (select \*,row_number() over(partition by transaction_id) as ct 
from ext_transaction_details) t

row_number() over(partition by transaction_id) 窗口函数：从1开始，按照顺序，生成分组内记录的序列，row_number()的值不会存在重复，当排序的值相同时，按照表中记录的顺序进行排列这里我们对分组的transaction_id
if(t.ct=1,transaction_id,concat(t.transaction_id,'_',t.ct-1)) 如果满足ct=1,就是transaction_id,否则进行字符串拼接生成新的id

在这里插入图片描述
Clean store_review table

create table store_review 
as select transaction_id,store_id,nvl(review_score,ceil(rand()\*5)) 
as review_score from ext_store_review

NVL(E1, E2)的功能为：如果E1为NULL，则函数返回E2，否则返回E1本身。
在这里插入图片描述
我们可以看到表中的数据存在空值，通过NVL函数对数据进行填充。

show tables

在这里插入图片描述
通过清洗后的近源表和明细表如上。

数据分析

Customer分析

找出顾客最常用的信用卡

select credit_type,count(credit_type) as peoplenum from customer_details
group by credit_type order by peoplenum desc limit 1

找出客户资料中排名前五的职位名称

select job,count(job) as jobnum from customer_details
group by job
order by jobnum desc
limit 5

在美国女性最常用的信用卡

select credit_type,count(credit_type) as femalenum from customer_details 
where gender='Female'
group by credit_type
order by femalenum desc
limit 1

按性别和国家进行客户统计

select count(\*) as customernum,country,gender from customer_details
group by country,gender

Transaction分析

计算每月总收入

select partday,sum(price) as countMoney from transaction_details group by partday

计算每个季度的总收入
Create Quarter Macro 定义季度宏，将时间按季度进行划分

create temporary macro 
calQuarter(dt string) 
concat(year(regexp_replace(dt,'/','-')),'年第',ceil(month(regexp_replace(dt,'/','-'))/3),'季度')

select calQuarter(buydate) as quarter,sum(price) as sale 
from transaction_details group by calQuarter(buydate)

在这里插入图片描述

按年计算总收入

create temporary macro calYear(dt string) year(regexp_replace(dt,'/','-'))

select calYear(buydate) as year,sum(price) as sale from transaction_details group by calYear(buydate)

按工作日计算总收入

create temporary macro calWeek(dt string) concat('星期',dayofweek(regexp_replace(dt,'/','-'))-1)

select concat('星期',dayofweek(regexp_replace(buydate,'/','-'))-1) as week,sum(price) as sale 
from transaction_details group by dayofweek(regexp_replace(buydate,'/','-'))

在这里插入图片描述

按时间段计算总收入(需要清理数据)

select concat(regexp_extract(buytime,'[0-9]{1,2}',0),'时') as time,sum(price) as sale from transaction_details group by regexp_extract(buytime,'[0-9]{1,2}',0)

在这里插入图片描述

按时间段计算平均消费
Time macro

create temporary macro calTime(time string) if(split(time,' ')[1]='PM',regexp_extract(time,'[0-9]{1,2}',0)+12,
if(split(time,' ')[1]='AM',regexp_extract(time,'[0-9]{1,2}',0),split(time,':')[0]))

select calTime(buytime) as time,sum(price) as sale from transaction_details group by calTime(buytime)

在这里插入图片描述

--define time bucket 
--early morning: (5:00, 8:00]
--morning: (8:00, 11:00]
--noon: (11:00, 13:00]
--afternoon: (13:00, 18:00]
--evening: (18:00, 22:00]
--night: (22:00, 5:00] --make it as else, since it is not liner increasing
--We also format the time. 1st format time to 19:23 like, then compare, then convert minites to hours
with
t1 as
(select calTime(buytime) as time,sum(price) as sale from transaction_details group by calTime(buytime) order by time),
t2 as
(select if(time>5 and time<=8,'early morning',if(time >8 and time<=11,'moring',if(time>11 and time <13,'noon',
if(time>13 and time <=18,'afternoon',if(time >18 and time <=22,'evening','night'))))) as sumtime,sale 
from t1)
select sumtime,sum(sale) from t2 
group by sumtime

在这里插入图片描述

按工作日计算平均消费

select concat('星期',dayofweek(regexp_replace(buydate,'/','-'))-1) 
as week,avg(price) as sale from transaction_details 
where dayofweek(regexp_replace(buydate,'/','-'))-1 !=0 and dayofweek(regexp_replace(buydate,'/','-'))-1 !=6
group by dayofweek(regexp_replace(buydate,'/','-'))

在这里插入图片描述

计算年、月、日的交易总数

select buydate as month,count(\*) as salenum from transaction_details group by buydate

找出交易量最大的10个客户

select c.customer_id,c.first_name,c.last_name,count(c.customer_id) as custnum from customer_details c
inner join transaction_details t
on c.customer_id=t.customer_id
group by c.customer_id,c.first_name,c.last_name
order by custnum desc
limit 10

找出消费最多的前10位顾客

select c.customer_id,c.first_name,c.last_name,sum(price) as sumprice from customer_details c
inner join transaction_details t
on c.customer_id=t.customer_id
group by c.customer_id,c.first_name,c.last_name
order by sumprice desc
limit 10

统计该期间交易数量最少的用户

select c.customer_id,c.first_name,c.last_name,count(\*) as custnum from customer_details c
inner join transaction_details t
on c.customer_id=t.customer_id
group by c.customer_id,c.first_name,c.last_name
order by custnum asc
limit 1

计算每个季度的独立客户总数

select calQuarter(buydate) as quarter,count(distinct customer_id) as uninum
from transaction_details
group by calQuarter(buydate)

计算每周的独立客户总数

select calWeek(buydate) as quarter,count(distinct customer_id) as uninum
from transaction_details
group by calWeek(buydate)

计算整个活动客户平均花费的最大值

select sum(price)/count(\*) as sale
from transaction_details
group by customer_id
order by sale desc
limit 1

统计每月花费最多的客户

with 
t1 as
(select customer_id,partday,count(distinct buydate) as visit from transaction_details group by partday,customer_id),
t2 as
(select customer_id,partday,visit,row_number() over(partition by partday order by visit desc) as visitnum from t1)
select \* from t2 where visitnum=1

统计每月访问次数最多的客户

with
t1 as
(select customer_id,partday,sum(price) as pay from transaction_details group by partday,customer_id),
t2 as
(select customer_id,partday,pay,row_number() over(partition by partday order by pay desc) as paynum from t1)
select \* from t2 where paynum=1

按总价找出最受欢迎的5种产品

select product,sum(price) as sale from transaction_details 
group by product
order by sale desc
limit 5

根据购买频率找出最畅销的5种产品

select product,count(\*) as num from transaction_details 
group by product
order by num desc
limit 5

根据客户数量找出最受欢迎的5种产品

select product,count(distinct customer_id) as num from transaction_details
group by product
order by num desc
limit 5

验证前5个details

select \* from transaction_details where product in ('Goat - Whole Cut')

Store分析

按客流量找出最受欢迎的商店

with 
t1 as (select store_id,count(\*) as visit from transaction_details 
group by 
store_id order by visit desc limit 1)
select s.store_name,t.visit 
from t1 t 
inner join 
ext_store_details s 
on t.store_id=s.store_id

根据顾客消费价格找出最受欢迎的商店

with 
t1 as (select store_id,sum(price) as sale from transaction_details 
group by 
store_id order by sale desc limit 1)
select s.store_name,t.sale 
from t1 t 
inner join 
ext_store_details s 
on t.store_id=s.store_id

根据顾客交易情况找出最受欢迎的商店

with
t1 as 
(select store_id,store_name from ext_store_details)
select t.store_id,store_name,count(distinct t.customer_id) as num
from transaction_details t
inner join t1 s
on s.store_id=t.store_id
group by t.store_id,store_name
order by num desc
limit 1

根据商店和唯一的顾客id获取最受欢迎的产品

with
t1 as (select store_id,product,count(distinct customer_id) as num from transaction_details
group by store_id,product order by num desc limit 1)
select s.store_name,t.num,t.product 
from t1 t 
inner join 
ext_store_details s 
on t.store_id=s.store_id

获取每个商店的员工与顾客比

with
t1 as (select store_id,count(distinct customer_id) as num from transaction_details
group by store_id )
select s.store_name,employee_number/num as vs from t1 t
inner join ext_store_details s 
on t.store_id=s.store_id

按年和月计算每家店的收入

select store_id,partday,sum(price) from transaction_details group by store_id,partday

按店铺制作总收益饼图

select store_id,sum(price) from transaction_details group by store_id

找出每个商店最繁忙的时间段

with
t1 as
(select store_id,count(customer_id) as peoplenum from transaction_details group by store_id,concat(regexp_extract(buytime,'[0-9]{1,2}',0),'时')),
t2 as
(select store_id,peoplenum,row_number() over(partition by store_id order by peoplenum desc) as peo from t1 )
select t.store_id,e.store_name,t.peoplenum from t2 t
inner join ext_store_details e
on e.store_id = t.store_id
where peo =1

找出每家店的忠实顾客

with
t1 as
(select customer_id,store_id,count(customer_id) as visit from transaction_details group by store_id,customer_id ),
t2 as
(select customer_id,store_id,visit,row_number() over(partition by store_id order by visit desc) as most from t1)
select r.customer_id,concat(first_name,last_name) as customer_name,r.store_id,store_name,r.visit from t2 r
inner join customer_details c
on c.customer_id=r.customer_id
inner join ext_store_details e
on e.store_id=r.store_id
where most=1

根据每位员工的最高收入找出明星商店

with


![img](https://img-blog.csdnimg.cn/img_convert/f857b02b4782d7acb96c53d1dbaf52f4.png)
![img](https://img-blog.csdnimg.cn/img_convert/38f98989e0c9bba630f783cb837976ba.png)

**网上学习资料一大堆，但如果学到的知识不成体系，遇到问题时只是浅尝辄止，不再深入研究，那么很难做到真正的技术提升。**

**[需要这份系统化资料的朋友，可以戳这里获取](https://bbs.csdn.net/topics/618545628)**


**一个人可以走的很快，但一群人才能走的更远！不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人，都欢迎加入我们的的圈子（技术交流、学习资源、职场吐槽、大厂内推、面试辅导），让我们一起学习成长！**

omer_name,r.store_id,store_name,r.visit from t2 r
inner join customer_details c
on c.customer_id=r.customer_id
inner join ext_store_details e
on e.store_id=r.store_id
where most=1

根据每位员工的最高收入找出明星商店

with


[外链图片转存中...(img-TvHX3lhp-1714434157205)]
[外链图片转存中...(img-9RHjePcm-1714434157205)]

**网上学习资料一大堆，但如果学到的知识不成体系，遇到问题时只是浅尝辄止，不再深入研究，那么很难做到真正的技术提升。**

**[需要这份系统化资料的朋友，可以戳这里获取](https://bbs.csdn.net/topics/618545628)**


**一个人可以走的很快，但一群人才能走的更远！不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人，都欢迎加入我们的的圈子（技术交流、学习资源、职场吐槽、大厂内推、面试辅导），让我们一起学习成长！**