hive-学习微博日志分析

灯下夜无眠

于 2022-11-28 19:40:01 发布

阅读量568

点赞数

分类专栏： hive 文章标签： hive 学习 hadoop

本文链接：https://blog.csdn.net/llmuzi123456789/article/details/128085796

版权

hive 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

-- Weibo log analysis
SHOW DATABASES;

-- Create the warehouse database before switching to it, so the script also
-- works on a metastore where weibo_db does not exist yet.
CREATE DATABASE IF NOT EXISTS weibo_db;
USE weibo_db;

-- External table over the raw log files under /data/wb.
-- Each row is exposed as a single unparsed string column named json.
CREATE EXTERNAL TABLE IF NOT EXISTS wb_table (
    json STRING
)
LOCATION '/data/wb';

-- Preview a few raw records.
SELECT json FROM wb_table LIMIT 10;

-- Data analysis
-- Total number of posts (result recorded by the author: 1451868)
SELECT COUNT(*) AS weibo_total FROM wb_table;

-- Number of distinct users (result recorded by the author: 78540)
-- Single-step alternative, kept for reference:
-- SELECT
--     COUNT(DISTINCT GET_JSON_OBJECT(t1.js, '$.userId'))
-- FROM (
--     SELECT SUBSTRING(t.json, 2, LENGTH(t.json) - 2) AS js
--     FROM wb_table AS t
-- ) AS t1;

-- SUBSTRING(json, 2, LENGTH(json) - 2) drops the first and the last character
-- of each record before it is parsed as JSON (presumably a wrapping pair of
-- brackets; confirm against the raw data).
-- The inner DISTINCT de-duplicates user ids; the outer COUNT counts them
-- (a NULL user id is not counted).
SELECT
    COUNT(t2.dis_uid) AS distinct_user_count
FROM (
    SELECT DISTINCT
        GET_JSON_OBJECT(t1.js, '$.userId') AS dis_uid
    FROM (
        SELECT SUBSTRING(t.json, 2, LENGTH(t.json) - 2) AS js
        FROM wb_table AS t
    ) AS t1
) AS t2;

-- Repost dimension
-- Total number of reposts over all posts of each user; top 10 users.
SELECT
    t1.user_id,
    SUM(t1.report_count) AS total_report_count
FROM (
    SELECT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS user_id,
        -- GET_JSON_OBJECT returns a string; cast so the sum is an integer
        -- total instead of relying on an implicit string-to-double cast.
        CAST(GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.reportCount') AS BIGINT) AS report_count
    FROM wb_table AS t
) AS t1
GROUP BY t1.user_id
ORDER BY total_report_count DESC
LIMIT 10;

-- The 10 most reposted posts; outputs the posting user's id and the count.
SELECT
    t1.user_id,
    t1.report_count
FROM (
    SELECT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS user_id,
        -- GET_JSON_OBJECT returns a string; without the cast ORDER BY would
        -- compare lexicographically and rank '99' above '1000'.
        CAST(GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.reportCount') AS BIGINT) AS report_count
    FROM wb_table AS t
) AS t1
ORDER BY t1.report_count DESC
LIMIT 10;

-- The 10 most liked posts; outputs the posting user's id and the count.
SELECT
    t1.user_id,
    t1.praise_count
FROM (
    SELECT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS user_id,
        -- GET_JSON_OBJECT returns a string; without the cast ORDER BY would
        -- compare lexicographically and rank '99' above '1000'.
        CAST(GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.praiseCount') AS BIGINT) AS praise_count
    FROM wb_table AS t
) AS t1
ORDER BY t1.praise_count DESC
LIMIT 10;

-- Number of posts published by each user; the 10 most active users.
-- The user id is part of the output so every count can be attributed.
SELECT
    t1.user_id,
    COUNT(t1.user_id) AS weibo_count
FROM (
    SELECT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS user_id
    FROM wb_table AS t
) AS t1
GROUP BY t1.user_id
ORDER BY weibo_count DESC
LIMIT 10;

-- Number of posts that carry pictures: pic_list contains at least one
-- 'http' substring (presumably picture URLs; confirm against the data).
SELECT
    COUNT(*) AS weibo_with_pic_count
FROM wb_table AS t
WHERE GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.pic_list') LIKE '%http%';

-- Number of distinct users who posted from an iPhone.
-- COUNT(DISTINCT userId) counts users; COUNT(*) would count posts instead.
-- LOWER() makes the match case-insensitive so 'iPhone' is matched as well.
SELECT
    COUNT(DISTINCT GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId')) AS iphone_user_count
FROM wb_table AS t
WHERE LOWER(GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.source')) LIKE '%iphone%';

-- User id and data source of posts with fewer than 1000 comments.
-- The comment count is kept in the output next to the requested source.
SELECT
    GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS user_id,
    CAST(GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.commentCount') AS BIGINT) AS comment_count,
    GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.source') AS source
FROM wb_table AS t
-- Explicit cast: GET_JSON_OBJECT returns a string, the threshold is numeric.
WHERE CAST(GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.commentCount') AS BIGINT) < 1000;

-- Data ETL
-- Export query results to MySQL.
SET hive.map.aggr = true;
-- SET hive.groupby.mapaggr.checkinterval = 100000;
SET hive.groupby.skewindata = true;

-- Save the query result into a staging table first, then export from the
-- table's storage path; by default it is located under
-- /usr/local/hive_dw/weibo_db.db/wb_user_nums
-- Stored as comma-delimited text. IF NOT EXISTS keeps the script re-runnable.
CREATE TABLE IF NOT EXISTS wb_user_nums (
    uid STRING,
    nums INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';

-- Fill the staging table with the number of posts per user.
-- INSERT OVERWRITE replaces any previous contents of wb_user_nums.
INSERT OVERWRITE TABLE wb_user_nums
SELECT
    t1.user_id,
    -- COUNT(*) is BIGINT while the target column nums is INT.
    CAST(COUNT(*) AS INT) AS nums
FROM (
    SELECT
        GET_JSON_OBJECT(SUBSTRING(t.json, 2, LENGTH(t.json) - 2), '$.userId') AS user_id
    FROM wb_table AS t
) AS t1
GROUP BY t1.user_id;