10、实战

数据准备

数据结构

两张表,视频表和用户表

字段备注详细描述
video id视频唯一id11位字符串
uploader视频上传者上传视频的用户名String
age视频年龄视频在平台上的整数天
category视频类别上传视频指定的视频分类
length视频长度整型数字标识的视频长度
views观看次数视频被浏览的次数
rate视频评分满分5分
ratings流量视频的流量,整型数字
comments评论数一个视频的整数评论数
related ids相关视频id相关视频的id,最多20个

ETL原始数据

部分数据如下所示

SapphireHearts  3   5
VanillaFresh76  37  16
Phyr0Musique    5   0
TwistedKitchen  1   1
TheDailyClog    19  4
4a4ron  4   7
StrictlyZ   1   0
VIRAJ818    6   42
v2uEfmWO6z8 mimzi8  601 Film & Animation    361 25341   4.63    78  44  AItaLnpbIAw 0X_of0woMBU yJ0s-kwG_Ro qThc1spQuiY WCEH9RZZKdU 8ZgFpRZeCds PQfyKAo0-ls 5peppnUHf9I Dx3ri8EVEvE ofmQOjZYLDI Ndtp2wP31K4 HEo6U-jkD4I 3DG1XyqgX3w b7M2mrnSZpM 5gJ21l8_4II MqySp7Nq5j0 MC--VwYTHAM 4SSZBAPleVc eVdiIbWT60M Da80HD18tp0

通过观察原始数据形式,可以发现,视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。即:将所有的类别用“&”分割,同时去掉两边空格,多个相关视频id也使用“&”进行分割

数据清洗

ETLUtils

public class ETLUtils {
    /**
     * Re-formats one raw video record for Hive consumption.
     *
     * The raw line is tab-separated; the first 9 tokens are the fixed
     * fields (videoId .. comments) and every token from index 9 on is a
     * related video id. The cleaned line keeps tokens 0..9 tab-separated
     * (token 9 is the first related id, i.e. the start of the array
     * field) and joins any further related ids with "&" so Hive can read
     * them as a single array&lt;string&gt; column. Spaces are stripped
     * from the category field (index 3), turning "A &amp; B" into "A&amp;B".
     *
     * @param ori one raw input line
     * @return the cleaned line, or {@code null} when the line has fewer
     *         than 9 tab-separated fields (malformed record, to be dropped)
     */
    public static String translateDate(String ori) {
        String[] fields = ori.split("\t");
        // A valid record carries at least the 9 fixed fields; related ids are optional.
        if (fields.length < 9) {
            return null;
        }
        // Normalize the category list: "People & Blogs" -> "People&Blogs".
        fields[3] = fields[3].replace(" ", "");
        // StringBuilder instead of StringBuffer: this is a local, single-threaded
        // buffer, so synchronized appends buy nothing.
        StringBuilder result = new StringBuilder(fields[0]);
        for (int i = 1; i < fields.length; i++) {
            // Tokens 1..9 keep the tab separator; extra related ids (index 10+)
            // are folded into one "&"-joined array field.
            result.append(i <= 9 ? '\t' : '&').append(fields[i]);
        }
        return result.toString();
    }
}

ETLMapper

public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Reused across map() calls to avoid a per-record allocation.
    private final Text outKey = new Text();
    private final NullWritable outValue = NullWritable.get();

    /**
     * Cleans one raw line via {@link ETLUtils#translateDate} and emits the
     * cleaned record as the key. Lines the ETL rejects (null result) are
     * silently dropped.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String cleaned = ETLUtils.translateDate(value.toString());
        if (cleaned == null) {
            return; // malformed record: skip
        }
        outKey.set(cleaned);
        context.write(outKey, outValue);
    }
}

ETLRunner

public class ETLRunner implements Tool {

    private Configuration configuration;

    /**
     * Configures and runs the map-only ETL job.
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     *             (must not already exist)
     * @return 0 when the job succeeds, 1 otherwise
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(configuration);

        job.setJarByClass(ETLRunner.class);
        job.setMapperClass(ETLMapper.class);

        // Map-only job, so the mapper output classes are the job output
        // classes; setting them once is sufficient.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // No reduce phase: cleaned records go straight from the mappers to HDFS.
        job.setNumReduceTasks(0);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean success = job.waitForCompletion(true);

        return success ? 0 : 1;
    }

    @Override
    public void setConf(Configuration conf) {
        configuration = conf;
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses generic Hadoop options and injects the Configuration
        // via setConf() before calling run().
        int run = ToolRunner.run(new ETLRunner(), args);
        System.out.println("run = " + run);
    }
}

上传文件到hdfs

hadoop fs -put user.txt /
hadoop fs -put video/ /

执行数据清洗

hadoop jar etl.jar com.zj.etl.ETLRunner /video /video/output

创建相关表

-- Staging table for the cleaned video records, kept as plain text so
-- "load data" can point at the ETL output files directly.
-- Arrays (category, relatedId) are "&"-separated in the file, as produced
-- by the ETL step above.
create table video_ori(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>)
row format delimited 
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;

-- Staging table for user records: uploader name, upload count, friend count.
-- NOTE(review): this table is declared bucketed but is later filled with
-- "load data", which moves files as-is and does not re-cluster them — the
-- bucketing is not actually enforced here; confirm the clustered by is intended.
create table video_user_ori(
    uploader string,
    videos int,
    friends int)
clustered by (uploader) into 24 buckets
row format delimited 
fields terminated by "\t" 
stored as textfile;

-- Final analysis table: same schema as video_ori but stored as ORC and
-- bucketed by uploader. Populated via insert-select from video_ori so the
-- rows are actually clustered.
create table video_orc(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>)
clustered by (uploader) into 8 buckets 
row format delimited fields terminated by "\t" 
collection items terminated by "&" 
stored as orc;

-- Final user table in ORC, bucketed by uploader; populated via
-- insert-select from video_user_ori.
create table video_user_orc(
    uploader string,
    videos int,
    friends int)
clustered by (uploader) into 24 buckets 
row format delimited 
fields terminated by "\t" 
stored as orc;

导入数据

-- "load data inpath" MOVES the HDFS files into the table directory.
-- NOTE(review): the ETL job wrote its cleaned output to /video/output, but
-- this loads /video — verify this picks up the cleaned files rather than
-- the raw input.
load data inpath "/video" into table video_ori;
load data inpath "/user.txt" into table video_user_ori;
-- insert-select converts to ORC and applies the declared bucketing.
insert into table video_orc select * from video_ori;
insert into table video_user_orc select * from video_user_ori;

需求

  1. 统计视频观看数Top10
  2. 统计视频类别热度Top10
  3. 统计视频观看数Top20所属类别
  4. 统计视频观看数Top50所关联视频的所属类别Rank
  5. 统计每个类别中的视频热度Top10
  6. 统计每个类别中视频流量Top10
  7. 统计上传视频最多的用户Top10以及他们上传的视频
  8. 统计每个类别视频观看数Top10
1. 
select * from video_orc order by views desc limit 10;
2. 
-- Requirement 2: top 10 hottest categories, measured by total views.
-- One row per (video, category) pair after the explode, then sum per category.
select t.cate, sum(t.views) as sumviews
from (
    select views, cate
    from video_orc
    lateral view explode(category) cat_tbl as cate
) t
group by t.cate
order by sumviews desc
limit 10;
3. 
-- Requirement 3: category distribution of the 20 most-viewed videos.
select cate, count(cate) as num
from (
    select *
    from video_orc
    order by views desc
    limit 20
) top20
lateral view explode(top20.category) cat_tbl as cate
group by cate;
4. 
-- Requirement 4: for the 50 most-viewed videos, rank the categories of
-- their related videos.
-- Innermost r1: explode relatedId of the top-50-by-views rows, one row per
-- related video id. Join back to video_orc to fetch each related video's
-- own category array (r.cat); the outer query explodes that array, counts
-- occurrences per category, and ranks by the count.
select ca, count(ca) as sumcate,rank() over(sort by count(ca) desc) as rk
from(select o.category as cat
from (select views,id
from video_orc
lateral view explode(relatedId) tableAlias as id
order by views desc
limit 50) r1 join video_orc o on r1.id = o.videoId) r
lateral view explode(r.cat) tableAlias as ca
group by ca;
5. 
-- Requirement 5: top 10 videos by view count within each category.
-- Explode category (one row per video/category pair), number rows per
-- category by descending views, keep row numbers 1..10.
select t.*
from
(select r.*, ROW_NUMBER() over(distribute by r.cate sort by r.views desc) as rk
from
(
select videoid,uploader,age,cate,length,views,rate,ratings,comments,relatedId
from video_orc
lateral view explode(category) tableAlias as cate) r) t
where t.rk <= 10;
6. 
-- Requirement 6: top 10 videos by traffic within each category.
-- Same pattern as the per-category views query, but ranked on the
-- ratings column (documented above as the traffic measure).
select t.*
from
(select r.*, ROW_NUMBER() over(distribute by r.cate sort by r.ratings desc) as rk
from
(
select videoid,uploader,age,cate,length,views,rate,ratings,comments,relatedId
from video_orc
lateral view explode(category) tableAlias as cate) r) t
where t.rk <= 10;
7. 
-- Requirement 7: the 10 users with the most uploads, joined to the videos
-- they uploaded. The inner join drops top uploaders that have no rows in
-- video_orc.
select t1.*, vo.*
from (
    select uploader, videos, friends
    from video_user_orc
    order by videos desc
    limit 10
) t1
join video_orc vo
    on t1.uploader = vo.uploader;
8. 
-- Requirement 8: top 10 videos per category, ranked by view count.
-- Explode category, then ROW_NUMBER per category ordered by views desc.
select t.*
from
(select r.*, ROW_NUMBER() over(distribute by r.cate sort by r.views desc) as rk
from
(
select videoid,uploader,age,cate,length,views,rate,ratings,comments,relatedId
from video_orc
lateral view explode(category) tableAlias as cate) r) t
where t.rk <= 10;

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值