数据准备
数据结构
两张表,视频表和用户表
| 字段 | 备注 | 详细描述 |
| --- | --- | --- |
video id | 视频唯一id | 11位字符串 |
uploader | 视频上传者 | 上传视频的用户名String |
age | 视频年龄 | 视频在平台上的整数天 |
category | 视频类别 | 上传视频指定的视频分类 |
length | 视频长度 | 整型数字标识的视频长度 |
views | 观看次数 | 视频被浏览的次数 |
rate | 视频评分 | 满分5分 |
ratings | 流量 | 视频的流量,整型数字 |
comments | 评论数 | 一个视频的整数评论数 |
related ids | 相关视频id | 相关视频的id,最多20个 |
ETL原始数据
部分数据如下所示
SapphireHearts 3 5
VanillaFresh76 37 16
Phyr0Musique 5 0
TwistedKitchen 1 1
TheDailyClog 19 4
4a4ron 4 7
StrictlyZ 1 0
VIRAJ818 6 42
v2uEfmWO6z8 mimzi8 601 Film & Animation 361 25341 4.63 78 44 AItaLnpbIAw 0X_of0woMBU yJ0s-kwG_Ro qThc1spQuiY WCEH9RZZKdU 8ZgFpRZeCds PQfyKAo0-ls 5peppnUHf9I Dx3ri8EVEvE ofmQOjZYLDI Ndtp2wP31K4 HEo6U-jkD4I 3DG1XyqgX3w b7M2mrnSZpM 5gJ21l8_4II MqySp7Nq5j0 MC--VwYTHAM 4SSZBAPleVc eVdiIbWT60M Da80HD18tp0
通过观察原始数据形式,可以发现,视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。即:将所有的类别用“&”分割,同时去掉两边空格,多个相关视频id也使用“&”进行分割
数据清洗
ETLUtils
public class ETLUtils {
    /**
     * Re-delimits one raw tab-separated video record into the layout the Hive
     * tables expect: the 9 fixed fields plus the first related id stay
     * tab-separated, all remaining related ids are joined with "&" so they
     * form a single array<string> column, and the spaces around "&" in the
     * category field are stripped.
     *
     * @param ori one raw input line (tab-separated)
     * @return the cleaned line, or {@code null} when the record has fewer
     *         than the 9 mandatory fields and must be discarded
     */
    public static String translateDate(String ori) {
        String[] fields = ori.split("\t");
        // Related ids are optional; anything shorter than the 9 fixed fields is malformed.
        if (fields.length < 9) {
            return null;
        }
        // Field 3 is the category list, e.g. "People & Blogs" -> "People&Blogs".
        fields[3] = fields[3].replace(" ", "");
        // StringBuilder instead of StringBuffer: no shared state, no need for synchronization.
        StringBuilder out = new StringBuilder(ori.length());
        for (int i = 0; i < fields.length; i++) {
            if (i == 0) {
                out.append(fields[i]);
            } else if (i <= 9) {
                // Fields 1..8 and the FIRST related id keep the tab separator.
                out.append('\t').append(fields[i]);
            } else {
                // Every further related id collapses into the same "&"-joined column.
                out.append('&').append(fields[i]);
            }
        }
        return out.toString();
    }
}
ETLMapper
public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** Reused output key; Hadoop serializes it before the next map() call. */
    private Text outKey = new Text();
    /** Placeholder value — this job only emits cleaned lines as keys. */
    private NullWritable outValue = NullWritable.get();

    /**
     * Cleans one raw record via {@link ETLUtils#translateDate} and emits the
     * result; malformed records (null from the util) are silently dropped.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String cleaned = ETLUtils.translateDate(value.toString());
        if (cleaned == null) {
            return; // fewer than 9 fields: discard the record
        }
        outKey.set(cleaned);
        context.write(outKey, outValue);
    }
}
ETLRunner
public class ETLRunner implements Tool {

    private Configuration configuration;

    /**
     * Configures and submits the map-only ETL job.
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     *             (must not exist yet)
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(configuration);
        job.setJarByClass(ETLRunner.class);
        job.setMapperClass(ETLMapper.class);
        // Output key/value classes set exactly once (the original set them twice).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Map-only job: the cleaned records need no shuffle or aggregation.
        job.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    @Override
    public void setConf(Configuration conf) {
        configuration = conf;
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }

    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new ETLRunner(), args);
        System.out.println("run = " + run);
        // Propagate the job status as the process exit code so shell
        // callers (e.g. `hadoop jar ... && next-step`) can detect failure.
        System.exit(run);
    }
}
上传文件到hdfs
# Upload the raw user table and the raw video directory to the HDFS root.
hadoop fs -put user.txt /
hadoop fs -put video/ /
执行数据清洗
hadoop jar etl.jar com.zj.etl.ETLRunner /video /video/output
创建相关表
-- Staging table for the CLEANED video data. Textfile storage so the ETL
-- output can be ingested with LOAD DATA; both array columns are
-- "&"-delimited, which is exactly the layout the MapReduce ETL produces.
create table video_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
-- Staging table for the raw user data: uploader name, number of uploaded
-- videos, number of friends — tab-separated text.
-- NOTE(review): LOAD DATA only moves files, so the CLUSTERED BY spec is not
-- actually enforced for this textfile table — confirm bucketing is needed here.
create table video_user_ori(
uploader string,
videos int,
friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as textfile;
-- Analysis copy of video_ori, stored as ORC and bucketed by uploader.
-- ORC is a binary columnar format with its own SerDe, so the original
-- "row format delimited ... terminated by ..." clause had no effect and
-- was dropped as a misleading no-op.
create table video_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
clustered by (uploader) into 8 buckets
stored as orc;
-- ORC analysis copy of video_user_ori, bucketed by uploader for join/sampling.
-- NOTE(review): the "row format delimited" clause has no effect on ORC
-- storage (ORC uses its own SerDe) — it is kept here only as written.
create table video_user_orc(
uploader string,
videos int,
friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as orc;
导入数据
-- Load the CLEANED output of the MapReduce ETL job — NOT the raw /video
-- directory: video_ori's "&" collection delimiter only matches post-ETL
-- rows, so loading the raw files would mis-parse every array column.
-- (LOAD DATA ... INPATH moves the HDFS files into the table location.)
load data inpath "/video/output" into table video_ori;
load data inpath "/user.txt" into table video_user_ori;
-- Rewrite the staged text data into the ORC tables for analysis.
insert into table video_orc select * from video_ori;
insert into table video_user_orc select * from video_user_ori;
需求
- 统计视频观看数Top10
- 统计视频类别热度Top10
- 统计视频观看数Top20所属类别
- 统计视频观看数Top50所关联视频的所属类别Rank
- 统计每个类别中的视频热度Top10
- 统计每个类别中视频流量Top10
- 统计上传视频最多的用户Top10以及他们上传的视频
- 统计每个类别视频观看数Top10
1.
select * from video_orc order by views desc limit 10;
2.
-- Requirement 2: category "heat" Top 10.
-- NOTE(review): heat is measured here as the SUM of views per category;
-- some solutions define it as the COUNT of videos per category — confirm
-- which definition the requirement intends.
select r.cate,sum(r.views) as sumviews
from
-- one row per (video, single category)
(select views,cate
from video_orc
lateral view explode(category) tableAlias as cate) r
group by r.cate
order by sumviews desc
limit 10;
3.
-- Requirement 3: categories of the Top-20 most-viewed videos, with how many
-- of those 20 videos fall into each category (multi-category videos count
-- once per category).
select cate,count(cate) as num
from (select * from video_orc order by views desc limit 20) r
lateral view explode(r.category) tableAlias as cate
group by cate;
4.
-- Requirement 4: rank the categories of the videos RELATED to the Top-50
-- most-viewed videos.
-- Fix: the original applied LIMIT 50 AFTER exploding relatedId, keeping 50
-- exploded rows (i.e. the related ids of only the first few videos) instead
-- of all related ids of the Top-50 videos. The Top-50 filter must run
-- before the explode.
select ca,
       count(ca) as sumcate,
       rank() over(order by count(ca) desc) as rk
from (
    -- category arrays of the related videos
    select o.category as cat
    from (
        -- related ids of the Top-50 videos by views
        select rid as id
        from (
            select relatedId, views
            from video_orc
            order by views desc
            limit 50
        ) t50
        lateral view explode(t50.relatedId) relTable as rid
    ) r1
    join video_orc o on r1.id = o.videoId
) r
lateral view explode(r.cat) cateTable as ca
group by ca;
5.
-- Requirement 5: Top 10 "hottest" videos (by views) within EACH category.
-- A multi-category video appears once per category.
select t.*
from
-- number the rows per category, highest views first
(select r.*, ROW_NUMBER() over(distribute by r.cate sort by r.views desc) as rk
from
(
-- one row per (video, single category)
select videoid,uploader,age,cate,length,views,rate,ratings,comments,relatedId
from video_orc
lateral view explode(category) tableAlias as cate) r) t
where t.rk <= 10;
6.
-- Requirement 6: Top 10 videos by traffic (the "ratings" / 流量 column)
-- within EACH category. Same shape as requirement 5, ranked by ratings.
select t.*
from
-- number the rows per category, highest ratings first
(select r.*, ROW_NUMBER() over(distribute by r.cate sort by r.ratings desc) as rk
from
(
-- one row per (video, single category)
select videoid,uploader,age,cate,length,views,rate,ratings,comments,relatedId
from video_orc
lateral view explode(category) tableAlias as cate) r) t
where t.rk <= 10;
7.
-- Requirement 7: the 10 users who uploaded the most videos, joined to every
-- video each of them uploaded (inner join: uploaders with no rows in
-- video_orc are dropped, as in the original query).
with top_uploaders as (
    select uploader, videos, friends
    from video_user_orc
    order by videos desc
    limit 10
)
select t1.*, vo.*
from top_uploaders t1
join video_orc vo on t1.uploader = vo.uploader;
8.
-- Requirement 8: Top 10 videos by views within each category.
-- NOTE(review): this is byte-identical to the requirement-5 query — the two
-- requirements coincide when "热度" is measured by views.
select t.*
from
-- number the rows per category, highest views first
(select r.*, ROW_NUMBER() over(distribute by r.cate sort by r.views desc) as rk
from
(
-- one row per (video, single category)
select videoid,uploader,age,cate,length,views,rate,ratings,comments,relatedId
from video_orc
lateral view explode(category) tableAlias as cate) r) t
where t.rk <= 10;