需求描述
统计硅谷影音视频网站的常规指标,各种TopN指标:
- 统计视频观看数Top10
- 统计视频类别热度Top10
- 统计视频观看数Top20所属类别
- 统计视频观看数Top50所关联视频的所属类别Rank
- 统计每个类别中的视频热度Top10
- 统计每个类别中视频流量Top10
- 统计上传视频最多的用户Top10以及他们上传的视频
- 统计每个类别视频观看数Top10
数据清洗
将所有的类别用“&”分割,同时去掉两边空格,多个相关视频id也使用“&”进行分割。
核心代码
package com.atguigu.etl;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;
import org.junit.Test;
import org.junit.internal.runners.statements.Fail;
import java.io.IOException;
public class ETLMapper extends Mapper<LongWritable, Text,Text, NullWritable> {
/* 空格去掉,分隔符改成‘&’ */
private Counter pass;
private Counter fail;
private StringBuilder sb = new StringBuilder();
private Text result=new Text();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
pass = context.getCounter("ETL","Pass");
fail = context.getCounter("ETL", "Fail");
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//一行数据
String line = value.toString();
//数据切分
String[] fields = line.split("\t");
//判断字段个数
if (fields.length>=9){
//去掉第四个字段的空格
fields[3] = fields[3].replace(" ","");
sb.setLength(0);
//拼接字段成一行
for (int i = 0; i < fields.length; i++) {
if (i == fields.length-1){
sb.append(fields[i]);
}else if (i <= 8){
//如果拼的是前九个字段
sb.append(fields[i]).append("\t");
}else {
sb.append(fields[i]).append("&");
}
}
result.set(sb.toString());
context.write(result, NullWritable.get());
pass.increment(1);
}else {
fail.increment(1);//不要了
}
}
}
建立video表和user表
create external table video_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
location '/guivideo/video/video_etl';
--user表
create external table user_ori(
uploader string,
videos int,
friends int)
row format delimited fields terminated by "\t"
location '/guivideo/user';
简单查询一下:
- video表
- user表
以上为外部表,需要再建立内部表
create table video_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
--user_orc表
create table user_orc(
uploader string,
videos int,
friends int)
stored as orc
tblproperties("orc.compress"="SNAPPY");
插入数据
--从外部表中插入数据
insert into table video_orc select * from video_ori;
insert into table user_orc select * from user_ori;
需求实现
1.统计视频观看数top10
SELECT
videoid,
views
FROM
video_orc
ORDER BY
views DESC
LIMIT 10;
- 结果
2. 统计视频类别热度Top10
思路:
(1)即统计每个类别有多少个视频,显示出包含视频最多的前10个类别。
(2)我们需要按照类别group by聚合,然后count组内的videoId个数即可。
(3)因为当前表结构为:一个视频对应一个或多个类别。所以如果要group by类别,需要先将类别进行列转行(展开),然后再进行count即可。
(4)最后按照热度排序,显示前10条。
SELECT
cate,
COUNT(videoid) n
FROM
(
SELECT
videoid,
cate
FROM
video_orc LATERAL VIEW explode(category) tbl as cate) t1
GROUP BY
cate
ORDER BY
n desc
limit 10;
3.统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数
思路:
(1)先找到观看数最高的20个视频所属条目的所有信息,降序排列
(2)把这20条信息中的category分裂出来(列转行)
(3)最后查询视频分类名称和该分类下有多少个Top20的视频
SELECT
cate,
COUNT(videoid) n
FROM
(
SELECT
videoid,
cate
FROM
(
SELECT
videoid,
views,
category
FROM
video_orc
ORDER BY
views DESC
LIMIT 20 ) t1 LATERAL VIEW explode(category) tbl as cate ) t2
GROUP BY
cate
ORDER BY
n DESC;
4.统计视频观看数Top50所关联视频的所属类别排序
- 统计观看数前50的视频的关联视频
SELECT
videoid,
views,
relatedid
FROM
video_orc
ORDER BY
views DESC
LIMIT 50;
- 炸开关联视频
SELECT
explode(relatedid) videoid
FROM
t1;
- 和原表Join获取关联视频的类别
SELECT
DISTINCT t2.videoid,
v.category
FROM
t2
JOIN video_orc v on
t2.videoid = v.videoid;
- 炸开类别
SELECT
explode(category) cate
FROM
t3;
最终代码:
SELECT
DISTINCT t4.cate,
t5.n
FROM
(
SELECT
explode(category) cate
FROM
(
SELECT
DISTINCT t2.videoid,
v.category
FROM
(
SELECT
explode(relatedid) videoid
FROM
(
SELECT
videoid,
views,
relatedid
FROM
video_orc
ORDER BY
views DESC
LIMIT 50 ) t1 ) t2
JOIN video_orc v on
t2.videoid = v.videoid ) t3 ) t4
JOIN (
SELECT
cate,
COUNT(videoid) n
FROM
(
SELECT
videoid,
cate
FROM
video_orc LATERAL VIEW explode(category) tbl as cate) g1
GROUP BY
cate ) t5 ON
t4.cate = t5.cate
ORDER BY
t5.n DESC;
5. 统计每个类别中的视频热度Top10,以Music为例
- 把视频表的类别炸开,生成中间表格video_category
CREATE
TABLE
video_category STORED AS orc TBLPROPERTIES("orc.compress"="SNAPPY") AS SELECT
videoid,
uploader,
age,
cate,
length,
views,
rate,
ratings,
comments,
relatedid
FROM
video_orc LATERAL VIEW explode(category) tbl as cate;
- 从video_category直接查询Music类的前10视频
SELECT
videoid,
views
FROM
video_category
WHERE
cate ="Music"
ORDER BY
views DESC
LIMIT 10;
结果
6. 统计每个类别中视频流量Top10,以Music为例
- 从video_category直接查询Music类的流量前10视频
SELECT
videoid,
ratings
FROM
video_category
WHERE
cate ="Music"
ORDER BY
ratings DESC
LIMIT 10;
7.统计上传视频最多的用户Top10以及他们上传的观看次数在前20的视频
理解一:前十用户每人前20
- 统计视频上传最多的用户Top10
SELECT
uploader,
videos
FROM
user_orc
ORDER BY
videos DESC
LIMIT 10;
- 和video_orc联立,找出这些用户上传的视频,并按照热度排名
SELECT
t1.uploader,
v.videoid,
RANK() OVER(PARTITION BY t1.uploader ORDER BY v.views DESC) hot
FROM
t1
LEFT JOIN video_orc v ON
t1.uploader = v.uploader;
- 求每个人前20
SELECT
t2.uploader,
t2.videoid,
t2.hot
FROM
t2
WHERE
hot <= 20;
最终代码:
SELECT
t2.uploader,
t2.videoid,
t2.hot
FROM(
SELECT
t1.uploader,
v.videoid,
RANK() OVER(PARTITION BY t1.uploader ORDER BY v.views DESC) hot
FROM(
SELECT
uploader,
videos
FROM
user_orc
ORDER BY
videos DESC
LIMIT 10)
t1
LEFT JOIN video_orc v ON
t1.uploader = v.uploader)
t2
WHERE
hot <= 20;
理解二:前十用户总榜前20
- 统计视频上传最多的用户Top10
SELECT
uploader,
videos
FROM
user_orc
ORDER BY
videos DESC
LIMIT 10;
- 观看数前20的视频
SELECT
videoid,
uploader,
views
FROM
video_orc
ORDER BY
views DESC
LIMIT 20;
- 联立两表,看看有没有他们上传的
SELECT
t1.uploader,
t2.videoid
FROM
t1
LEFT JOIN t2 ON
t1.uploader = t2.uploader;
最终代码:
SELECT
t1.uploader,
t2.videoid
FROM(
SELECT
uploader,
videos
FROM
user_orc
ORDER BY
videos DESC
LIMIT 10)
t1
LEFT JOIN (
SELECT
videoid,
uploader,
views
FROM
video_orc
ORDER BY
views DESC
LIMIT 20
)t2 ON
t1.uploader = t2.uploader;
结果:一个都没有…
8. 统计每个类别视频观看数Top10
- 从video_category表查出每个类别视频观看数排名
SELECT
cate,
videoid,
views,
RANK() OVER(PARTITION BY cate ORDER BY views DESC) hot
FROM
video_category;
- 取每个类别的Top10
SELECT
cate,
videoid,
views
FROM
t1
WHERE
hot <= 10;
最终代码
SELECT
cate,
videoid,
views
FROM(
SELECT
cate,
videoid,
views,
RANK() OVER(PARTITION BY cate ORDER BY views DESC) hot
FROM
video_category)
t1
WHERE
hot <= 10;
运行结果: