# 生干六个小时 TVT 微博舆情ETL数据分析详解,手把手教你HSQL分析数据

Table格式在这里

this1表部分数据

this2表部分数据

#!/bin/sh
sql=$1 #2.open databelse echo '++++++start hive database wangtianxin++++++' echo '++++++write sql1++++++' hive -e "use 你的数据库;$sql;" > e1.txt
clear
cat e1.txt
echo '----------end--------'



1、(基础分析题)各频道参与博主的人数分布排行，求top5
select num,remark,rank from(select num,remark,dense_rank() over(order by num desc) as rank from(select count(1) num,remark from this2 where remark!=‘null’ group by remark)as aaa) as a where rank <=5;

2、(基础分析题)各频道评论量排行，求top5

select remark,num,num1 from(select remark,num,dense_rank() over(order by num) as num1 from(select remark,sum(num1) as num from (select remark,num1 from (select uid, count(*) num1 from this1 group by uid) as a inner join this2 on a.uid = this2.id )a group by remark) a) a where num1<=5;

3、(基础分析题)各频道评论转发量排行，求top5

String测试

select remark,rk from (select remark ,rank() over(order by a desc)as rk from(select remark,sum(a) as a from (select a,remark from(select uid,sum(city+location) as a from this1 group by uid)as a inner join this2 on this2.id=a.uid) as a group by remark)as a)as a where rk<=5;

table1  select uid,sum(size(split(words, ‘//@’))-1) as nu from this1 group by uid
table2  select remark ,nu from as table1 inner join this2 on table1.uid=this2.id
tbale3  select remark,sum(nu) as n from table2 group by remark
table3  select remark,n,rank() over(order by remark desc) as rk from table2
table4  select remark ,rk ,num from table3 where rk<=5

select remark ,nu from (select uid,sum(size(split(words, ‘//@’))-1) as nu from this1 group by uid) as table1 inner join this2 on table1.uid=this2.id

select remark,sum(nu) as n from (select remark ,nu from (select uid,sum(size(split(words, ‘//@’))-1) as nu from this1 group by uid) as table1 inner join this2 on table1.uid=this2.id) as table2 group by remark

select remark,n,rank() over(order by remark desc) as rk from (select remark,sum(nu) as n from (select remark ,nu from (select uid,sum(size(split(words, ‘//@’))-1) as nu from this1 group by uid) as table1 inner join this2 on table1.uid=this2.id) as table2 group by remark
) as a

select remark ,rk from (select remark,n,rank() over(order by remark desc) as rk from (select remark,sum(nu) as n from (select remark ,nu from (select uid,sum(size(split(words, ‘//@’))-1) as nu from this1 group by uid) as table1 inner join this2 on table1.uid=this2.id) as table2 group by remark) as a)aa where rk<=5

4、(基础分析题)各频道博文评论的回复(再回复)量排行，求top5
select remark ,rk from (select remark,n,rank() over(order by remark desc) as rk from (select remark,sum(nu) as n from (select remark ,nu from (select uid,sum(size(split(words, ‘回复@’))-1) as nu from this1 group by uid) as table1 inner join this2 on table1.uid=this2.id) as table2 group by remark) as a)aa where rk<=5

5、(综合分析题)各频道活跃度排行
***先定义合理的计算口径，然后写对应的SQL语句
step1 : 候选影响因子集合找到

step2 : 拟定一个计算公式

number:参与话题人数
t1:评论量
t2:回复量

select remark,count(uid) as number,sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 from this1 inner join this2 on this2.id=this1.uid group by remark

select remark ,number,rank() over(order by number1.1+t11.5+t2*2 desc) as rk from (select remark,count(uid) as number,sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 from this1 inner join this2 on this2.id=this1.uid group by remark) as a

6、(综合分析题)各频道的博主的影响力排行，求top5
***先定义合理的计算口径，然后写对应的SQL语句
Select id,remark,sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 from this1 inner join this2 on this2.id=this1.uid group by id,remark

Select id,remark,sum(t1+t2) over(partition by id) as af from (Select id,remark,sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 from this1 inner join this2 on this2.id=this1.uid group by id,remark)a

Select remark,id,af,rank() over(partition by remark order by af) from (Select id,remark,sum(t1+t2) over(partition by id) as af from (Select id,remark,sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 from this1 inner join this2 on this2.id=this1.uid group by id,remark)a)a;

Select remark,id,af,rank() over(partition by remark order by af desc) as rk from (Select id,remark,sum(t1+t2) over(partition by id) as af from (Select id,remark,sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 from this1 inner join this2 on this2.id=this1.uid group by id,remark)a)a;

Select remark,id,af,rk from(Select remark,id,af,rank() over(partition by remark order by af desc) as rk from (Select id,remark,sum(t1+t2) over(partition by id) as af from (Select id,remark,sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 from this1 inner join this2 on this2.id=this1.uid group by id,remark)a)a)as a where rk <=5

7、(综合分析题)各频道的博文的影响力排行,求top5
***先定义合理的计算口径，然后写对应的SQL语句
Select sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 , uid from this1 group by words,uid

Select uid,t1,t2,remark from(Select sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 , uid from this1 group by words,uid)as a inner join this2 on a.uid=this2.id

Select remark,uid,sum(t1+t2) over(partition by words)as rf from (Select words,uid,t1,t2,remark from(Select words,sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 , uid from this1 group by words,uid)as a inner join this2 on a.uid=this2.id)as a

Select remark, uid,rf,dense_rank() over(partition by remark order by rf desc) as rka from(Select words, remark,uid,sum(t1+t2) over(partition by words)as rf from (Select words,uid,t1,t2,remark from(Select words,sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 , uid from this1 group by words,uid)as a inner join this2 on a.uid=this2.id)as a)as a

Select remark,uid,rf,rka from(Select remark, uid,rf,rank() over(partition by remark order by rf desc) as rka from(Select words, remark,uid,sum(t1+t2) over(partition by words)as rf from (Select words,uid,t1,t2,remark from(Select words,sum(size(split(words, ‘//@’))-1) as t1, sum(size(split(words, ‘回复@’))-1) as t2 , uid from this1 group by words,uid)as a inner join this2 on a.uid=this2.id)as a)as a)as a where rka <=5

8、(综合分析题)各频道的博主的行为特征分析之每天的活跃时间段分析，

***先定义合理的计算口径，然后写对应的SQL语句
select uid,sum(size(split(words, ‘//@’))-1) over(partition by uid) as t1, sum(size(split(words, ‘回复@’))-1) over(partition by uid) as t2 ,hour(lotime) as time from this1 ;

Select remark, uid, t1, t2, hotime from(select uid,sum(size(split(words, ‘//@’))-1) over(partition by uid) as t1, sum(size(split(words, ‘回复@’))-1) over(partition by uid) as t2 ,hour(lotime) as hotime from this1 )as a inner join this2 on a.uid=this2.id

Select remark, uid, t1, t2, hotime from(select uid,sum(size(split(words, ‘//@’))-1) over(partition by lotime) as t1, sum(size(split(words, ‘回复@’))-1) over(partition by lotime) as t2 ,hour(lotime) as hotime from this1 )as a inner join this2 on a.uid=this2.id

Select remark, uid, t1, t2, hotime,words from(select words,uid,sum(size(split(words, ‘//@’))-1) over(partition by lotime) as t1, sum(size(split(words, ‘回复@’))-1) over(partition by lotime) as t2 ,hour(lotime) as hotime from this1 )as a inner join this2 on a.uid=this2.id

Select remark , uid,sum(t1+t2) over(partition by words),hotime from(Select remark, uid, t1, t2, hotime,words from(select words,uid,sum(size(split(words, ‘//@’))-1) over(partition by lotime) as t1, sum(size(split(words, ‘回复@’))-1) over(partition by lotime) as t2 ,hour(lotime) as hotime from this1 )as a inner join this2 on a.uid=this2.id) as a

Select remark,uid,rf,hotime,rank() over(order by remark,hotime) from(Select remark , uid,sum(t1+t2) over(partition by words) as rf,hotime from(Select remark, uid, t1, t2, hotime,words from(select words,uid,sum(size(split(words, ‘//@’))-1) over(partition by lotime) as t1, sum(size(split(words, ‘回复@’))-1) over(partition by lotime) as t2 ,hour(lotime) as hotime from this1 )as a inner join this2 on a.uid=this2.id) as a)as a

Select remark,uid,sum(rf) over(partition by remark,hotime) as tm,hotime,rk1 from (Select remark,uid,rf,hotime,rank() over(order by remark,hotime) as rk1 from(Select remark , uid,sum(t1+t2) over(partition by words) as rf,hotime from(Select remark, uid, t1, t2, hotime,words from(select words,uid,sum(size(split(words, ‘//@’))-1) over(partition by lotime) as t1, sum(size(split(words, ‘回复@’))-1) over(partition by lotime) as t2 ,hour(lotime) as hotime from this1 )as a inner join this2 on a.uid=this2.id) as a)as a)as a

Select hotime ,remark,tm,rank() over(partition by remark order by tm desc) from(Select distinct hotime ,remark,tm from(Select remark,uid,sum(rf) over(partition by remark,hotime) as tm,hotime,rk1 from (Select remark,uid,rf,hotime,rank() over(order by remark,hotime) as rk1 from(Select remark , uid,sum(t1+t2) over(partition by words) as rf,hotime from(Select remark, uid, t1, t2, hotime,words from(select words,uid,sum(size(split(words, ‘//@’))-1) over(partition by lotime) as t1, sum(size(split(words, ‘回复@’))-1) over(partition by lotime) as t2 ,hour(lotime) as hotime from this1 )as a inner join this2 on a.uid=this2.id) as a)as a)as a)as a)as a

Select hotime ,remark,tm, rk from(Select hotime ,remark,tm,rank() over(partition by remark order by tm desc) as rk from(Select distinct hotime ,remark,tm from(Select remark,uid,sum(rf) over(partition by remark,hotime) as tm,hotime,rk1 from (Select remark,uid,rf,hotime,rank() over(order by remark,hotime) as rk1 from(Select remark , uid,sum(t1+t2) over(partition by words) as rf,hotime from(Select remark, uid, t1, t2, hotime,words from(select words,uid,sum(size(split(words, ‘//@’))-1) over(partition by lotime) as t1, sum(size(split(words, ‘回复@’))-1) over(partition by lotime) as t2 ,hour(lotime) as hotime from this1 )as a inner join this2 on a.uid=this2.id) as a)as a)as a)as a)as a)as a where rk <=5

08-30 1082

06-01 1万+
07-20 2200
12-09 465
08-14 2090
09-19 2547
07-19 9122
05-05 4645
02-21 992
03-26 1984
11-29 485
01-16 199