hive电子商务消费行为分析

hive电子商务消费行为分析

1. 掌握Zeppelin的使用

2. 了解数据结构

3.数据清洗

4. 基于Hive的数据分析

1.物料准备

(1)Customer表

customer_details

details

customer_id

Int, 1 - 500

first_name

string

last_name

string

email

string, such as willddy@gmail.com

gender

string, Male or female

address

string

country

string

language

string

job

string, job title/position

credit_type

string, credit card type, such as visa

credit_no

string, credit card number

(2)Store表

store_details

details

store_id

Int, 1 - 5

store_name

string

employee_number

Int, 在store有多少employee

(3)Review表

store_review

details

stransaction_id

Int, 1 - 8000

store_id

Int, 1 - 5

review_store

Int, 1 - 5

(4)Transaction表

transaction_details

details

transaction_id

Int, 1 - 1000

customer_id

Int, 1 - 500

store_id

Int, 1 - 5

price

decimal, such as 5.08

product

string, things bought

date

string, when to purchase

time

string, what time to purchase

2. 实现数据清洗

2.1  找出PII (personal information identification) 或PCI (personal confidential information) 数据进行加密或hash

--数据加密处理
select * from ext_customer_details;
create view if not exists vm_customer_details as
select
    customer_id,
    first_name,
    unbase64(last_name) as last_name,
    unbase64(email) as email,
    gender,
    unbase64(address) as address,
    country,
    language,
    job,
    credit_type,
    unbase64(concat(unbase64(credit_no),'kb23')) as credit_no
from ext_customer_details;

show views ;
show tables ;
select * from vw_customer_details;

2.2  重新组织transaction数据按照日期YYYY-MM做分区,并对transaction_details中的重复数据生成新ID

--交易详情表  按月创建分区表

  create table if not exists transaction_details (

    transaction_id string,

    customer_id string,

    store_id string,

    price decimal(8,2),

    product string,

    purchase_date string,

    purchase_time string

  )

  partitioned by (purchase_month string);


------ext_transaction_details  重复数据生成新的id-------------

--查询重复的数据

  with

    base as ( select row_number() over (partition by transaction_id order by 1) as rn, * from ext_transaction_details )

  select * from base where rn > 1;

  

  --打开动态分区

  set hive.exec.dynamic.partition=true;

  set hive.exec.dynamic.partition.mode=nonstrict;

  

  --重复id处理方式  transaction_id_fix_rn  --> 123456_fix_2

  with

    base as (

        select

            transaction_id,

            customer_id,

            store_id,

            price,

            product,

            purchase_date,

            purchase_time,

            from_unixtime(unix_timestamp(purchase_date, 'yyyy-MM-dd'),'yyyy-MM') as purchase_month,

            row_number() over (partition by transaction_id order by store_id) as rn

        from ext_transaction_details)

  insert overwrite table transaction_details partition (purchase_month)

  select `if`(rn=1,transaction_id, concat(transaction_id, '_fix_', rn)), customer_id, store_id,

            price, product, purchase_date, purchase_time,purchase_month from base;

  

  show partitions transaction_details;

2.3  过滤掉store_review中没有评分的数据

%hive

use shopping;

-- select * from ext_store_review where review_score == '';

create view vw_store_review as(

select transaction_id, store_id, review_score from ext_store_review where review_score <> '');

3. Customer分析(zeppelin和datagrip查询)

3.1 找出顾客最常用的信用卡

%hive

use shopping;

select

    country,

    credit_type,

    count(distinct credit_no) as credit_cnt

from vm_customer_details

group by country,credit_type

order by credit_cnt desc;

3.2 找出客户资料中排名前五的职位名称

%hive

use shopping;

select job, count(1) as pn

from vw_customer_details

group by job

order by pn desc limit 10;

3.3 在美国女性最常用的信用卡

%hive

use shopping;

select

    credit_type,

    count(1) as credit_cnt

from vw_customer_details

where country='United States' and gender = 'Female'

group by credit_type

order by credit_cnt desc limit 3;

3.4 按性别和国家进行客户统计

%hive

use shopping;

select

    country,

    gender,

    count(1) as cn

from vw_customer_details

group by country,gender

order by cn desc limit 3;

4. Transaction分析

4.1计算每月总收入

4.2计算每个季度的总收入

4.3按年计算总收入

4.4按工作日计算总收入

4.5按时间段计算总收入(需要清理数据)

4.6按时间段计算平均消费

4.4按工作日计算平均消费

4.8计算年、月、日的交易总数

4.9找出交易量最大的10个客户

4.10找出消费最多的前10位顾客

-- 1计算每月总收入

  select

    purchase_month, sum(price) as monthSUM

  from transaction_details

  group by purchase_month;
 

  -- 2计算每季度总收入

  with

    base as ( select

    price,

    concat_ws('-',substr(purchase_date,1,4),cast(ceil(month(purchase_date)/3.0) as string))as year_quarter

  from transaction_details)

  select  year_quarter, sum(price) as year_quarterSUM from base group by year_quarter;

  

  -- 3计算每年度总收入

  select

    substr(purchase_date,1,4),

    sum(price) as yearSUM

  from transaction_details

  group by substr(purchase_date,1,4);

  

  



  -- 4计算工作日收入

  select

    `dayofweek`(purchase_date),

    sum(price) as daySUM

  from transaction_details

  group by `dayofweek`(purchase_date)

  having `dayofweek`(purchase_date) in (2,3,4,5,6);

  

  -- 5按时间段计算总收入(需要清理数据)

  with

    base as (

        select price,purchase_time,

        if(purchase_time like '%M',from_unixtime(unix_timestamp(purchase_time,'hh:mm aa'),'HH:mm'),purchase_time) as time_format

        from transaction_details),

    timeformat as (

  select price, purchase_time, time_format, (cast(split(time_format,':')[0] as decimal(4,2))+ cast(split(time_format,':')[1] as decimal(4,2))/60) as purchase_time_in_hr from base),

t1 as (select price, purchase_time, time_format, purchase_time_in_hr,

    `if`(purchase_time_in_hr > 5 and purchase_time_in_hr <= 8, 'early morning',

    `if`(purchase_time_in_hr > 8 and purchase_time_in_hr <= 11, 'morning',

    `if`(purchase_time_in_hr > 11 and purchase_time_in_hr <= 13, 'noon',

    `if`(purchase_time_in_hr > 13 and purchase_time_in_hr <= 18, 'afternoon',

    `if`(purchase_time_in_hr > 18 and purchase_time_in_hr <= 22, 'evening', 'night'))))) as time_bucket

  from timeformat)

  select time_bucket, sum(price) bucketSUM, avg(price) avgprice from t1 group by time_bucket ;

  
 
   

  

  -- 6按时间段计算平均消费

  

  

  -- 7按工作日计算平均消费

  select

    `dayofweek`(purchase_date),

    avg(price) as dayAVG

  from transaction_details

  where `dayofweek`(purchase_date) in (2,3,4,5,6)

  group by `dayofweek`(purchase_date);

  

  -- 8计算年、月、日的交易总数

  select

    year(purchase_date) currentYear,

    count(1) as sumCount

  from transaction_details

  group by year(purchase_date);

  

  select

    substring(purchase_date,1,7) currentMonth,

    count(1) as sumCount

  from transaction_details

  group by substring(purchase_date,1,7);

  

  select

    day(purchase_date) currentDay,

    count(1) as sumCount

  from transaction_details

  group by day(purchase_date);

  

  -- 9找出交易量最大的10个客户

  with

    base as (

    select customer_id,count(transaction_id) as trans_cnt,sum(price) as customerSUM from transaction_details group by customer_id),

    cust_detail as(

    select concat_ws(' ', cd.first_name, '***') as cust_name, base.* ,

       dense_rank() over (order by trans_cnt desc ) rn from base join vw_customer_details cd on base.customer_id=cd.customer_id)

  select * from cust_detail where rn<=10;

  

  

  -- 10找出消费最多的前10位顾客

  with

    base as (

    select customer_id,sum(price) as customerSUM from transaction_details group by customer_id),

    cust_detail as(

    select concat_ws(' ', cd.first_name, '***') as cust_name, base.* ,

       dense_rank() over (order by customerSUM desc ) rn from base join vw_customer_details cd on base.customer_id=cd.customer_id)

  select * from cust_detail where rn<=10;
 

Mysql中建表查询

7.11统计该期间交易数量最少的用户

7.12计算每个季度的独立客户总数

7.13计算每周的独立客户总数

7.14计算整个活动客户平均花费的最大值

7.15统计每月花费最多的客户

7.16统计每月访问次数最多的客户

7.17按总价找出最受欢迎的5种产品

7.18根据购买频率找出最畅销的5种产品

7.19根据客户数量找出最受欢迎的5种产品

-- 7.11统计该期间交易数量最少的用户

WITH

       t1 AS(SELECT customer_id,COUNT(1) c FROM transaction_details GROUP BY customer_id ORDER BY c),

       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY t1.c) as rn FROM t1)

SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;



-- 7.12计算每个季度的独立客户总数

SELECT DISTINCT COUNT(customer_id),

       concat_ws('-',substr(purchase_date,1,4),cast(ceil(month(purchase_date)/3.0) as CHAR(20)))as year_quarter

from transaction_details

GROUP BY year_quarter

ORDER BY (year_quarter+0);



-- 7.13计算每周的独立客户总数

SELECT DISTINCT COUNT(customer_id),

       concat_ws('-',substr(purchase_date,1,4),cast(WEEKOFYEAR(purchase_date) as CHAR(20)))as week_year

from transaction_details

GROUP BY week_year;



-- 7.14计算整个活动客户平均花费的最大值

WITH

       t1 AS(SELECT customer_id, avg(price) as costAVG FROM transaction_details GROUP BY customer_id),

       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY costAVG DESC) as rn FROM t1)

SELECT * FROM t2 WHERE rn=1;



-- 7.15统计每月花费最多的客户

WITH

       t1 AS(SELECT purchase_month,customer_id, sum(price) as costSUM FROM transaction_details GROUP BY purchase_month,customer_id),

       t2 AS(SELECT t1.*,DENSE_RANK() over(PARTITION by purchase_month ORDER BY costSUM DESC) as rn FROM t1)

SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;


-- 7.16统计每月访问次数最多的客户

WITH

       t1 AS(SELECT purchase_month,customer_id, count(customer_id) as costCount FROM transaction_details GROUP BY purchase_month,customer_id),

       t2 AS(SELECT t1.*,DENSE_RANK() over(PARTITION by purchase_month ORDER BY costCount DESC) as rn FROM t1)

SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;



-- 7.17按总价找出最受欢迎的5种产品

WITH

       t1 AS(SELECT product,sum(price) as proSUM FROM transaction_details GROUP BY product),

       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proSUM DESC) as rn FROM t1)

SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;



-- 7.18根据购买频率找出最畅销的5种产品

WITH

       t1 AS(SELECT product,COUNT(product) as proCount FROM transaction_details GROUP BY product),

       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proCount DESC) as rn FROM t1)

SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;



-- 7.19根据客户数量找出最受欢迎的5种产品

WITH

       t1 AS(SELECT product,COUNT(customer_id) as proCount FROM transaction_details GROUP BY product),

       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proCount DESC) as rn FROM t1)

SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值