<Zhuuu_ZZ>HIVE(终)项目:电子商务消费行为分析

5 篇文章 0 订阅

一、前言

这是学习hive中遇到的一个项目,如果有不妥的地方或者更好的建议,欢迎指出!
我们主要进行以下一些练习:

数据结构
数据清洗
基于Hive的数据分析

二、项目需求

首先和大家讲一下这个项目的需求:

对某零售企业最近1年门店收集的数据进行数据分析

潜在客户画像
用户消费统计
门店的资源利用率
消费的特征人群定位
数据的可视化展现

三、数据结构

本次练习一共使用了如下四张表,有需要数据源的同学们直接留下邮箱即可。
在这里插入图片描述

  • Customer 表
    • 问题:language字段数据存在错误
      在这里插入图片描述
  • Transaction表
    • 问题:表中transaction_id有重复,但数据有效,需要修复数据
      在这里插入图片描述
  • Store 表
    在这里插入图片描述
  • Review 表
    • 问题:表中有无效的score数据,表中有将transaction_id映射到错误的store_id
      在这里插入图片描述

四、建表和数据清洗

创建外部表

  • Create HDFS Folder
hdfs dfs -mkdir -p /shopping/data/customer
hdfs dfs -mkdir -p /shopping/data/transaction
hdfs dfs -mkdir -p /shopping/data/store
hdfs dfs -mkdir -p /shopping/data/review
  • Upload the file to HDFS
hdfs dfs -put /root/customer_details.csv /shopping/data/customer/
hdfs dfs -put /root/transaction_details.csv /shopping/data/transaction/
hdfs dfs -put /root/store_details.csv /shopping/data/store/
hdfs dfs -put /root/store_review.csv /shopping/data/review/
  • Create database
drop database if exists shopping cascade
create database shopping
  • Use database
use shopping
  • Create external table
    • 创建四张对应的外部表,也就是本次项目中的近源表。
create external table if not exists ext_customer_details(
customer_id string,
first_name string,
last_name string,
email string,
gender string,
address string,
country string,
language string,
job string,
credit_type string,
credit_no string
)
ROW FORMAT serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties("separatorchar"=",","escapeChar"="\\")
stored as textfile
location '/shopping/data/customer/'
tblproperties('skip.header.line.count'='1')

create external table if not exists ext_transaction_details(
transaction_id string,
customer_id string,
store_id string,
price double,
product string,
purchase_date string,
purchase_time string
)
ROW FORMAT serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties("separatorchar"=",","escapeChar"="\\")
stored as textfile
location '/shopping/data/transaction'
tblproperties('skip.header.line.count'='1')


create external table if not exists ext_store_details(
store_id string,
store_name string,
employee_number int
)
ROW FORMAT serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties("separatorchar"=",","escapeChar"="\\")
stored as textfile
location '/shopping/data/store/'
tblproperties('skip.header.line.count'='1')

create external table if not exists ext_store_review(
transaction_id string,
store_id string,
review_score int
)
ROW FORMAT serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties("separatorchar"=",","escapeChar"="\\")
stored as textfile
location '/shopping/data/review'
tblproperties('skip.header.line.count'='1')

数据清洗

  • 找出PII (personal information identification) 或PCI (personal confidential information) 数据进行加密或hash
    Clean and Mask customer_details 创建明细表
create table if not exists customer_details 
as select customer_id,first_name,last_name,md5(email) email,gender,md5(address) address,country,job,credit_type,md5(credit_no) 
from ext_customer_details

  • 重新组织transaction数据按照日期YYYY-MM做分区
create table if not exists transaction_details(
transaction_id string,
customer_id string,
store_id string,
price double,
product string,
purchase_date string,
purchase_time string
)
partitioned by (partday string)
ROW FORMAT serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties("separatorchar"=",","escapeChar"="\\")
stored as textfile

开启动态分区

set hive.exec.dynamic.partition=true
set hive.exec.dynamic.partition.mode=nonstrict
  • 对transaction_details中的重复数据生成新ID
-- partday 分区 transaction_id 重复 
select if(t.ct=1,transaction_id,concat(t.transaction_id,'_',t.ct-1)) 
transaction_id,customer_id,store_id,price,product,purchase_date,purchase_time,date_format(purchase_date,'yyyy-MM') 
as month 
from (select *,row_number() over(partition by transaction_id) as ct 
from ext_transaction_details) t
insert into transaction_details partition(month) 
select if(t.ct=1,transaction_id,concat(t.transaction_id,'_',t.ct-1)) transaction_id,customer_id,store_id,price,product,purchase_date,purchase_time,date_format(regexp_replace(purchase,'/','-'),'yyyy-MM') 
as month from (select *,row_number() over(partition by transaction_id) as ct 
from ext_transaction_details) t 
  • 过滤掉store_review中没有评分的数据
create table store_review 
as select transaction_id,store_id,nvl(review_score,ceil(rand()*5)) 
as review_score from ext_store_review

五、数据分析

Customer分析

  • 找出顾客最常用的信用卡
select credit_type,count(credit_type) as peoplenum from customer_details
group by credit_type order by peoplenum desc limit 1
  • 找出客户资料中排名前五的职位名称
select job,count(job) as jobnum from customer_details
group by job
order by jobnum desc
limit 5
  • 美国女性最常用的信用卡
select credit_type,count(credit_type) as femalenum from customer_details 
where gender='Female'
group by credit_type
order by femalenum desc
limit 1

Transaction分析

  • 计算每月总收入
select month,sum(price) as countMoney from transaction_details group by month
  • 计算每个季度的总收入
select year(purchase_date) years,concat('季度',floor((month(purchase_date)-1)/3+1)) quarters,sum(price) from transaction_details 
group by year(purchase_date),concat('季度',floor((month(purchase_date)-1)/3+1))
  • 按年计算总收入
select year(purchase_date),sum(price) from transaction_details 
group by year(purchase_date)
  • 按工作日计算总收入
select concat('星期',dayofweek(regexp_replace(purchase_date,'/','-'))-1),sum(price) from transaction_details
where dayofweek(regexp_replace(purchase_date,'/','-'))-1<>6 and dayofweek(regexp_replace(purchase_date,'/','-'))-1<>0 
group by concat('星期',dayofweek(regexp_replace(purchase_date,'/','-'))-1)
  • 按时间段计算总收入(需要清理数据)
with
t1 as (select price,if(instr(purchase_time,'PM')<>0,split(purchase_time,':')[0]+12,split(purchase_time,':')[0]) time from transaction_details  ),
t2 as (select *,case when time>5 and time<=8 then 'early_mornig'
when time>8 and time<=11 then 'mornig'
when time>11 and time<=13 then 'noon'
when time>13 and time<=18 then 'afternoon'
when time>18 and time<=22 then 'evening'
else  'night' end as timeduan  from t1 )
select t2.timeduan,sum(t2.price) from t2
group by t2.timeduan
  • 按时间段计算平均消费
with
t1 as (select price,if(instr(purchase_time,'PM')<>0,split(purchase_time,':')[0]+12,split(purchase_time,':')[0]) time from transaction_details  ),
t2 as (select *,case when time>5 and time<=8 then 'early_mornig'
when time>8 and time<=11 then 'mornig'
when time>11 and time<=13 then 'noon'
when time>13 and time<=18 then 'afternoon'
when time>18 and time<=22 then 'evening'
else  'night' end as timeduan  from t1 )
select t2.timeduan,avg(t2.price) from t2
group by t2.timeduan
  • 按工作日计算平均消费
select concat('星期',dayofweek(regexp_replace(purchase_date,'/','-'))-1),avg(price) from transaction_details
where dayofweek(regexp_replace(purchase_date,'/','-'))-1<>6 and dayofweek(regexp_replace(purchase_date,'/','-'))-1<>0 
group by concat('星期',dayofweek(regexp_replace(purchase_date,'/','-'))-1)
  • 计算年、月、日的交易总数
select distinct * from
(select store_id,count(transaction_id) over(partition by store_id,
month(purchase_date)) m,count(transaction_id) over(partition by store_id,
year(purchase_date)) y,count(transaction_id) over(partition by store_id,
day(purchase_date)) d  from transaction_details) a
  • 找出交易量最大的10个客户
select customer_id,count(1) c from transaction_details group by customer_id
order by c desc limit 10
  • 找出消费最多的前10位顾客
select customer_id,sum(price) s form transaction_details group by customer_id
order by s desc limit 10
  • 统计该期间交易数量最少的用户
select customer_id,count(1) c from transaction_details group by customer_id
order by c  limit 1
  • 计算每个季度的独立客户总数
select store_id, year(purchase_date),concat('季度',floor((month(purchase_date)-1)/3+1)),
count(distinct customer_id) from transaction_details 
group by store_id,year(purchase_date),concat('季度',floor((month(purchase_date)-1)/3+1))
  • 计算每周的独立客户总数
select store_id, year(purchase_date),weekofyear(purchase_date),
count(distinct customer_id) from transaction_details 
group by store_id, year(purchase_date),weekofyear(purchase_date)
  • 计算整个活动客户平均花费的最大值
select customer_id,avg(price) a from transaction_details group by customer_id
order by a desc limit 1
  • 统计每月花费最多的客户
with
t1 as (select month,customer_id,sum(price) s from transaction_details group by month,customer_id),
t2 as (select month,customer_id,s,row_number() over(partition by month order by s desc  ) r from t1)
select * from t2 where r=1;
  • 统计每月访问次数最多的客户
with
t1 as (select customer_id,month,count(*) c from transaction_details group by customer_id,month),
t2 as (select customer_id,month,s,row_number() over(partition by month order by c desc  ) r from t1)
select * from t2 where r=1;
  • 按总价找出最受欢迎的5种产品
select product,sum(price) as sale from transaction_details 
group by product
order by sale desc
limit 5
  • 根据购买频率找出最畅销的5种产品
select product,count(*) as num from transaction_details 
group by product
order by num desc
limit 5
  • 根据客户数量找出最受欢迎的5种产品
select product,count(distinct customer_id) as num from transaction_details
group by product
order by num desc
limit 5
  • 验证前5个details
select * from transaction_details where product in ('Goat - Whole Cut')

Store 分析

  • 按客流量找出最受欢迎的商店
with 
t1 as (select store_id,count(*) as visit from transaction_details 
group by 
store_id order by visit desc limit 1)
select s.store_name,t.visit 
from t1 t 
inner join 
ext_store_details s 
on t.store_id=s.store_id
  • 根据顾客消费价格找出最受欢迎的商店
with 
t1 as (select store_id,sum(price) as sale from transaction_details 
group by 
store_id order by sale desc limit 1)
select s.store_name,t.sale 
from t1 t 
inner join 
ext_store_details s 
on t.store_id=s.store_id
  • 根据顾客交易情况找出最受欢迎的商店
with
t1 as 
(select store_id,store_name from ext_store_details)
select t.store_id,store_name,count(distinct t.transaction_id) as num
from transaction_details t
inner join t1 s
on s.store_id=t.store_id
group by t.store_id,store_name
order by num desc
limit 1
  • 根据商店和唯一的顾客id获取最受欢迎的产品
with
t1 as (select store_id,product,count(distinct customer_id) as num from transaction_details
group by store_id,product order by num desc limit 1)
select s.store_name,t.num,t.product 
from t1 t 
inner join 
ext_store_details s 
on t.store_id=s.store_id
  • 获取每个商店的员工与顾客比
with
t1 as (select store_id,count(distinct customer_id) as num from transaction_details
group by store_id )
select s.store_name,employee_number/num as vs from t1 t
inner join ext_store_details s 
on t.store_id=s.store_id
  • 按年和月计算每家店的收入
select store_id,month,sum(price) from transaction_details group by store_id,month
  • 按店铺制作总收益饼图
select store_id,sum(price) from transaction_details group by store_id
  • 找出每个商店最繁忙的时间段
with
t1 as
(select store_id,concat(split(purchase_time,':')[0],'时') timeduan,count(customer_id) as peoplenum from transaction_details group by store_id,concat(split(purchase_time,':')[0],'时')),
t2 as
(select store_id,timeduan,peoplenum,rank() over(partition by store_id order by peoplenum desc) as peo from t1 )
select t2.store_id,timeduan,e.store_name,t2.peoplenum from t2 
inner join ext_store_details e
on e.store_id = t2.store_id
where peo =1;
  • 找出每家店的忠实顾客
with
t1 as
(select customer_id,store_id,count(customer_id) as visit from transaction_details group by store_id,customer_id ),
t2 as
(select customer_id,store_id,visit,row_number() over(partition by store_id order by visit desc) as most from t1)
select r.customer_id,concat(first_name,last_name) as customer_name,r.store_id,store_name,r.visit from t2 r
inner join customer_details c
on c.customer_id=r.customer_id
inner join ext_store_details e
on e.store_id=r.store_id
where most=1
  • 根据每位员工的最高收入找出明星商店
with
t1 as
(select store_id,sum(price) as sumprice from transaction_details group by store_id)
select t.store_id,s.store_name,sumprice/employee_number as avgprice  from t1 t
inner join ext_store_details s
on s.store_id=t.store_id
order by avgprice desc
limit 1

Review 分析

  • 在ext_store_review中找出存在冲突的交易映射关系
select t.transaction_id,t.store_id from transaction_details t
inner join ext_store_review e
on e.transaction_id=t.transaction_id
where e.store_id!=t.store_id
  • 了解客户评价的覆盖率
with
trans as (select store_id,count(transaction_id) as countSale from transaction_details group by store_id),
rev as (select store_id,count(distinct transaction_id) as review from store_review group by store_id)
select s.store_name,(r.review*100/t.countSale) as cover from  trans t 
inner join rev r 
on t.store_id=r.store_id 
inner join ext_store_details s
on t.store_id=s.store_id
  • 根据评分了解客户的分布情况
select store_id,review_score,count(review_score) as numview from ext_store_review  where review_score>0 group by review_score,store_id
  • 根据交易了解客户的分布情况
select store_id,count(transaction_id) as transactionnum from ext_store_review  group by store_id
  • 客户给出的最佳评价是否总是同一家门店
with 
t1 as (select customer_id,count(distinct s.store_id) c,count(s.store_id) c1 from store_review s join transaction_details t on substr(transaction_id,0,length(transaction_id)-2)=s.stransaction_id where split(transaction_id,'_')[1]<>'2'
and review_score=5
group by customer_id),
t2 as (select customer_id,'yes' review from t1 where t1.c=1 and t1.c1>1 ),
t3 as (select customer_id,'no' review from t1 where t1.c>1)
select customer_id, review from t2 union all select customer_id,review from t3;
  • 3
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值