淘宝数据可视化大屏案例(Hadoop实验)_hadoop数据分析可视化案例(2)

img
img

网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。

需要这份系统化资料的朋友,可以戳这里获取

一个人可以走的很快,但一群人才能走的更远!不论你是正从事IT行业的老鸟或是对IT行业感兴趣的新人,都欢迎加入我们的的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!

#具体定义sink
agent3.sinks.sink3.type = hive
agent3.sinks.sink3.hive.metastore = thrift://hadoop:9083
agent3.sinks.sink3.hive.database = taobao
agent3.sinks.sink3.hive.table = taobao_data
agent3.sinks.sink3.serializer = DELIMITED
agent3.sinks.sink3.serializer.delimiter = “,”
agent3.sinks.sink3.serializer.serdeSeparator = ‘,’
agent3.sinks.sink3.serializer.fieldnames = user_id,item_id,behavior_type,user_geohash,item_category,date,hour
agent3.sinks.sink3.batchSize = 90

#组装source、channel、sink
agent3.sources.source3.channels = channel3
agent3.sinks.sink3.channel = channel3



vi file_hive.properties


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201203230204241.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)  
 该文件用于监听的作用,自动就会在家目录下面,然后我们需要创建几个文件夹,就是下图我备注的那些字段  
 ![在这里插入图片描述](https://img-blog.csdnimg.cn/20201203230425148.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)



mkdir -p /home/hadoop/taobao/data
mkdir -p /home/hadoop/taobao/tmp/point


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201203230733862.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)  
 3.3 创建数据库


由于版本问题,需要导入指定的jar包



H I V E H O M E / h c a t a l o g / s h a r e / h c a t a l o g / 下的所有包,拷贝入 {HIVE_HOME}/hcatalog/share/hcatalog/下的所有包,拷贝入 HIVEHOME/hcatalog/share/hcatalog/下的所有包,拷贝入{FLUME_HOME}/lib


执行下面的命令:



cd ${HIVE_HOME}/hcatalog/share/hcatalog/

cp * ${FLUME_HOME}/lib/


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201203231514996.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)  
 启动hive



hive


创建数据库并使用



create database taobao;
use taobao;


建立表格



create table taobao.taobao_data (
user_id varchar(255) ,
item_id varchar(255) ,
behavior_type varchar(255) ,
user_geohash varchar(255) ,
item_category varchar(255) ,
date varchar(10) ,
hour varchar(3)
)
clustered by(user_id) into 3 buckets
row format delimited fields terminated by ‘,’
stored as orc tblproperties(‘transactional’=‘true’);


创建导出数据表



create table taobao.taobao_result (
key varchar(255) ,
value varchar(255)) ;


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201203232140412.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)  
 **4.导入数据**


先启动hive --service metastore -p 9083  
 (这个端口号要配置到flume文件中,可用netstat -tulpn | grep 9083查看端口是否监听)



hive --service metastore -p 9083


再去启动flume



flume-ng agent --conf conf --conf-file file_hive.properties -name agent3 -Dflume.hadoop.logger=INFO,console


然后把文件数据导入到,之前创建的data文件夹里面就完成了自动导入



mv /home/hadoop/12yue.csv /home/hadoop/taobao/data/


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201203233039439.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)




---


**5.数据分析**


5.1 把总访问量查询出来,导入到结果表



insert into taobao_result
(select “PV”, u. 总访问量 FROM
(select count(*) AS 总访问量 FROM taobao_data) u);


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201203233825944.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)



INSERT INTO taobao_result
(SELECT " UV", u. 用户数量 FROM
(SELECT COUNT(DISTINCT user_id) AS 用户数量 FROM taobao_data) u);


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201203234429992.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)  
 可以查看一下



select * from taobao_result;


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201203234553587.png)




---


5.2 – 浏览页跳失率:用户仅仅有pv行为,没有其它的收藏、加购、购买行为



INSERT INTO taobao_result
(SELECT “跳失率”, u. 总访问量 FROM
(
SELECT b.仅pv用户 / a.总用户 AS 总访问量 FROM
(SELECT count( DISTINCT user_id ) AS 总用户 FROM taobao_data) a ,
(SELECT
count( DISTINCT user_id ) AS 仅pv用户
FROM taobao_data
WHERE
user_id NOT IN ( SELECT DISTINCT user_id FROM taobao_data WHERE behavior_type = ‘2’ ) AND
user_id NOT IN ( SELECT DISTINCT user_id FROM taobao_data WHERE behavior_type = ‘3’ ) AND
user_id NOT IN ( SELECT DISTINCT user_id FROM taobao_data WHERE behavior_type = ‘4’ )) b
) u);


**注意在hive里面不可以使用in not in查询,所以这里要用连接查询解决这个问题**


经过大量的测试,我自己写了一个sq语句,也可以达到以上的效果


**首先把要插入的信息,查询出来**



SELECT “跳失率”, u.总访问量 FROM
(
SELECT b.仅pv用户 / a.总用户 AS 总访问量 FROM
(SELECT count( DISTINCT user_id ) AS 总用户 FROM taobao_data) a,
(SELECT count( DISTINCT user_id ) AS 仅pv用户 from (select * from taobao_data) as c LEFT JOIN (SELECT DISTINCT user_id as id FROM taobao_data WHERE behavior_type = ‘2’ or behavior_type=‘3’ or behavior_type=‘4’) as d on c.user_id=d.id WHERE d.id is NULL ) as b
) as u;


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201204125833662.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)插入



INSERT INTO taobao_result
(SELECT “跳失率”, u.总访问量 FROM
(
SELECT b.仅pv用户 / a.总用户 AS 总访问量 FROM
(SELECT count( DISTINCT user_id ) AS 总用户 FROM taobao_data) a,
(SELECT count( DISTINCT user_id ) AS 仅pv用户 from (select * from taobao_data) as c LEFT JOIN (SELECT DISTINCT user_id as id FROM taobao_data WHERE behavior_type = ‘2’ or behavior_type=‘3’ or behavior_type=‘4’) as d on c.user_id=d.id WHERE d.id is NULL ) as b
) as u);


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201204130102680.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)  
 ![在这里插入图片描述](https://img-blog.csdnimg.cn/20201204130358773.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)  
 优化代码类型(思路类似)



SELECT count(DISTINCT user_id) from datas left JOIN
(select DISTINCT user_id as t from datas WHERE behavior_type =‘2’ or behavior_type =‘3’ or behavior_type =‘4’) as a on user_id=a.t WHERE a.t is null




---


5.3  
 – 有购买行为的用户数量、用户的购物情况、复购率分别是多少?



SELECT COUNT( 1 ) FROM (SELECT u.user_id, SUM( CASE u.behavior_type WHEN “4” THEN 1 ELSE 0 END ) AS buy FROM taobao_data u GROUP BY u.user_id HAVING buy > 0 ) t;


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201204004144322.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)



SELECT COUNT(1) AS 总数, SUM(CASE u.behavior_type WHEN “1” THEN 1 ELSE 0 END ) AS 点击行为,SUM(CASE u.behavior_type WHEN “2” THEN 1 ELSE 0 END ) AS 收藏行为,SUM(CASE u.behavior_type WHEN “3” THEN 1 ELSE 0 END ) AS 加购物车行为,SUM(CASE u.behavior_type WHEN “4” THEN 1 ELSE 0 END ) AS 购买行为 FROM taobao_data u;


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201204010030868.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)




---


5.4 – 复购率 购买次数大于2的人占总的人数的比率


必须要输入下面代码


[原因如下](https://bbs.csdn.net/forums/4f45ff00ff254613a03fab5e56a57acb)



set hive.mapred.mode=nonstrict;


注意这个sq语句里面的4,必须也要用单引号括起来



SELECT t2.repeat_buy/t1.total AS 复购率 FROM
(SELECT COUNT(DISTINCT u1.user_id) AS total FROM taobao_data u1) t1 ,
(SELECT COUNT(1) AS repeat_buy FROM
(SELECT u.user_id, SUM(CASE u.behavior_type WHEN ‘4’ THEN 1 ELSE 0 END ) AS buy FROM taobao_data u GROUP BY u.user_id HAVING buy>1) t) t2;


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201204012114321.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70#pic_center)![在这里插入图片描述](https://img-blog.csdnimg.cn/20201204012145541.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)5.5 分析用户在哪个时间段最为活跃,包括日期和时间



SELECT date,count(*) as t from taobao_data GROUP BY date ORDER BY t DESC;


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201204191910255.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)



SELECT hour,count(*) as t from taobao_data GROUP BY hour ORDER BY t DESC;


![在这里插入图片描述](https://img-blog.csdnimg.cn/20201204192406302.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NzcyMzczMg==,size_16,color_FFFFFF,t_70)




---


**导出数据**


去自己的Navicat里面执行去试试也可以



CREATE DATABASE taobao;

create table taobao.taobao_result (
key varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
value varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;

img
img
img

既有适合小白学习的零基础资料,也有适合3年以上经验的小伙伴深入学习提升的进阶课程,涵盖了95%以上大数据知识点,真正体系化!

由于文件比较多,这里只是将部分目录截图出来,全套包含大厂面经、学习笔记、源码讲义、实战项目、大纲路线、讲解视频,并且后续会持续更新

需要这份系统化资料的朋友,可以戳这里获取

ci ROW_FORMAT = Dynamic;

[外链图片转存中…(img-iBFUAVKK-1715514382701)]
[外链图片转存中…(img-1rpNMD7M-1715514382701)]
[外链图片转存中…(img-3rnWh75a-1715514382701)]

既有适合小白学习的零基础资料,也有适合3年以上经验的小伙伴深入学习提升的进阶课程,涵盖了95%以上大数据知识点,真正体系化!

由于文件比较多,这里只是将部分目录截图出来,全套包含大厂面经、学习笔记、源码讲义、实战项目、大纲路线、讲解视频,并且后续会持续更新

需要这份系统化资料的朋友,可以戳这里获取

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值