Hive小练习之YouTube影音数据处理
一:数据结构
1.视频表
表 6-13 视频表
字段 备注 详细描述
video id 视频唯一 id 11 位字符串
uploader 视频上传者 上传视频的用户名 String
age 视频年龄 视频在平台上的整数天
category 视频类别 上传视频指定的视频分类
length 视频长度 整形数字标识的视频长度
views 观看次数 视频被浏览的次数
rate 视频评分 满分 5 分
Ratings 流量 视频的流量,整型数字
comments 评论数 一个视频的整数评论数
related ids 相关视频 id 相关视频的 id,最多 20 个
2.用户表
表 6-14 用户表
字段 备注 字段类型
uploader 上传者用户名 string
videos 上传视频数 int
friends 朋友数量 int
二:ETL(清洗)原始数据
通过观察原始数据形式,可以发现,视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。即:将所有的类别用“&”分割,同时去掉两边空格,多个相关视频 id 也使用“&”进行分割。
编写MapReduce程序,进行对原始数据的ETL
1. 新建Maven工程,导入依赖
<dependencies>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.8.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.mrunit</groupId>
<artifactId>mrunit</artifactId>
<version>1.1.0</version>
<classifier>hadoop2</classifier>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
2. Mapper类
package com.ityouxin.video;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* @program: video
* @description: video的mapper类
* @author: lhx
* @create: 2019-12-01 09:45
**/
public class VideoMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Reused output key to avoid allocating a Text per record.
    private Text k2 = new Text();

    /**
     * Cleans one raw tab-separated line and emits it as the map output key.
     * Malformed lines (fewer than 9 fields) are silently dropped.
     *
     * @param key     byte offset of the line (unused)
     * @param value   the raw input line
     * @param context Hadoop task context used to emit the cleaned line
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] items = line.split("\t");
        String etlstr = etl(items);
        // Skip invalid records instead of passing null to Text.set (NPE).
        if (etlstr == null) {
            return;
        }
        k2.set(etlstr);
        context.write(k2, NullWritable.get());
    }

    /**
     * Re-assembles one record: the 9 fixed fields (indexes 0-8) stay
     * tab-separated, spaces are stripped from the category field
     * (index 3, e.g. "People &amp; Blogs" -> "People&amp;Blogs"), and all
     * related-video ids (index 9 and beyond) are joined with "&amp;".
     *
     * @param items the tab-split fields of one raw line
     * @return the cleaned record, or {@code null} when the line has
     *         fewer than the 9 mandatory fields
     */
    private String etl(String[] items) {
        // A valid record must contain at least the 9 fixed fields;
        // the original check (< 3) still allowed items[3] to overflow.
        if (items.length < 9) {
            return null;
        }
        // Strip spaces inside the category field. The replacement must be
        // assigned back — String.replace returns a new string.
        items[3] = items[3].replace(" ", "");
        // StringBuilder: no synchronization needed in a per-call local.
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < items.length; i++) {
            if (i < 9) {
                sb.append(items[i]);
                // Tab between fixed fields; no trailing tab when the
                // record has no related-video ids.
                if (i < 8 || items.length > 9) {
                    sb.append("\t");
                }
            } else {
                sb.append(items[i]);
                // "&" between related ids, none after the last one.
                if (i < items.length - 1) {
                    sb.append("&");
                }
            }
        }
        return sb.toString();
    }
}
3. Driver类
package com.ityouxin.video;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* @Description:
* @Author: fl
* @CreateTime: 2019-12-01 09:56
*/
public class VideoDriver extends Configured implements Tool {

    /**
     * Configures and runs the map-only ETL job.
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     *             (deleted first if it already exists)
     * @return 0 on success, 1 on failure or bad usage
     */
    public int run(String[] args) throws Exception {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException on args[1].
        if (args.length < 2) {
            System.err.println("Usage: VideoDriver <input path> <output path>");
            return 1;
        }
        Job job = Job.getInstance(getConf());
        job.setJarByClass(VideoDriver.class);
        job.setMapperClass(VideoMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Map-only job: cleaned lines are written straight out, no shuffle.
        job.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        Path output = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, output);
        // Remove a stale output directory so a re-run does not fail with
        // FileAlreadyExistsException; guard with exists() to avoid a
        // pointless delete call on the first run.
        FileSystem fs = FileSystem.get(getConf());
        if (fs.exists(output)) {
            fs.delete(output, true);
        }
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        try {
            System.exit(ToolRunner.run(new VideoDriver(), args));
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}
3. 打包,上传到集群的HDFS文件系统中。
4. 执行java程序,进行数据清洗[lhx@hadoop102 root]$ hadoop jar /opt/module/datas/video-1.0-SNAPSHOT.jar com.ityouxin.video.VideoDriver /video /output
三:数据表准备
创建表:youtubevideo_ori,youtubevideo_user_ori,
创建表:youtubevideo_orc,youtubevideo_user_orc,
youtubevideo_ori。
create table youtubevideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
youtubevideo_user_ori:
create table youtubevideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
将原数据插入orc表中
youtubevideo_orc:
create table youtubevideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;
youtubevideo_user_orc:
create table youtubevideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc;
2. 导入ETL之后的数据
youtubevideo_ori:
load data inpath "/youtubevideo/output/" into table youtubevideo_ori;
youtubevideo_user_ori:
load data inpath "/youtubevideo/user/" into table youtubevideo_user_ori;
3. 向ORC(ORC文件格式是一种Hadoop生态圈中的列式存储格式)表中插入数据(ORC压缩格式的表数据只能使用insert通过表对表导入)
youtubevideo_orc:
insert into table youtubevideo_orc select * from youtubevideo_ori;
youtubevideo_user_orc:
insert into table youtubevideo_user_orc select * from youtubevideo_user_ori;
三:业务处理
-
统计视频观看数Top10
思路:使用 order by 按照 views 字段做一个全局排序即可,同时设置只显示前 10 条。 最终代码: select * from youtubevideo_orc order by views desc limit 10; 如果出现内存溢出,需要设置:set mapred.child.java.opts=-Xmx1024m;
-
统计视频类别的热度Top10
分析: 1. 视频类别的热度,按观看数量?类别下的视频数量?这里按照类别下的视频数量来处理 2. 视频类别类型为array<string>,所以需要炸开(explode),作为中间表 select videoId,category_name from youtubevideo_orc lateral view explode(category) t_category as category_name; --t1 3. 统计每个类别下的视频数量,对类别进行分组,对视频数量(视频videoId)进行聚合排序 select category_name category,count(t1.videoId) hot from t1 group by t1.category_name order by hot desc limit 10;--t2 4. 整合前两步 select category_name as category, count(t1.videoId) as hot from ( select videoId, category_name from youtubevideo_orc lateral view explode(category) t_category as category_name) t1 group by t1.category_name order by hot desc limit 10;
-
统计出视频观看数量最高的20个视频的所属类别以及类别下包含的Top20视频的个数
分析: 1. 最开始先找出观看Top20的视频以及所属的行信息,降序排列 select * from youtubevideo_orc order by views desc limit 20;--t1 2. 类别这一列,如果用到,一定需要炸开,炸开后,取出相对应的类别和id select videoId,category_name from t1 lateral view explode(category) t_category as category_name; --t2 3. 最后查询视频分类的名称以及该分类下包含Top20的视频个数,根据id聚合出类别下的视频个数,对类别进行分组,按视频的个数进行排序 select category_name as category,count(t2.videoId) as hot_with_views from t2 group by category_name order by hot_with_views desc;
-
统计视频观看数Top50所关联视频得所属类别排序
1. 首先查询出观看Top50的所有信息 select * from youtubevideo_orc order by views desc limit 50; --t1 2. 将查询出来的50条视频信息的相关视频relatedId进行炸开,列转行 select explode(relatedId) as videoId from t1; --t2 3. 将相关视频的id和表内连接,得到两列数据:category和videoId select distinct(t2.videoId),t3.category from t2 inner join youtubevideo_orc t3 on t2.videoId = t3.videoId;-- t4 select videoId,category_name from t4 lateral view explode(category) t_category as category_name;--t5 select videoId,category_name from(select distinct(t2.videoId),t3.category from t2 inner join youtubevideo_orc t3 on t2.videoId = t3.videoId) t4 lateral view explode(category) t_category as category_name;--t5 4. 按照视频类别进行分组,统计每组视频个数,然后排行 select category_name as category, count(t5.videoId) as hot from ( select videoId, category_name from ( select distinct(t2.videoId), t3.category from ( select explode(relatedId) as videoId from ( select * from youtubevideo_orc order by views desc limit 50) t1) t2 inner join youtubevideo_orc t3 on t2.videoId = t3.videoId) t4 lateral view explode(category) t_category as category_name) t5 group by category_name order by hot desc; --t6
-
统计每个类别中的视频热度Top10.以Music为例
1. 首先要找到Music类别,上面说到凡是需要类别字段的,都需要进行炸开,另外创建一张表用来存放炸开后的类别数据(类别表) create table youtubevideo_category( videoId string, uploader string, age int, categoryId string, length int, views int, rate float, ratings int, comments int, relatedId array<string>) row format delimited fields terminated by "\t" collection items terminated by "&" stored as orc; 2. 炸开类别字段,向刚创建的表中插入数据 insert into table youtubevideo_category select videoId,uploader,age,categoryId,length,views,rate,ratings,comments,relatedId from youtubevideo_orc lateral view explode(category) catetory as categoryId; 3. 查询Music类别下观看数Top10 select videoId,views from youtubevideo_category where categoryId = "Music" order by views desc limit 10;
-
统计每个类别中视频流量 Top10,以 Music 为例
同上,先炸开类别,再列转行,然后按照ratings流量进行排序
select
videoId,
views,
ratings
from
youtubevideo_category
where
categoryId = "Music"
order by ratings desc limit 10;
-
统计使用上传最多的用户Top10以及他们上传的视频观看Top前20的视频
1. 先找到上传TOP10视频的用户 select * from youtubevideo_user_orc order by videos desc limit 10;--t1 2. 将视频表与用户表进行join连接,得到的全表字段 ,按照views进行排序,得到Top前20 select * from t1 join youtubevideo_orc t2 on t1.uploader = t2.uploader order by views desc limit 20; select * from (select * from youtubevideo_user_orc order by videos desc limit 10)t1 join youtubevideo_orc t2 on t1.uploader = t2.uploader order by views desc limit 20;
-
统计每类别下的视频观看数Top10
select t1.* from ( select videoId, categoryId, views, row_number() over(partition by categoryId order by views desc) rn from youtubevideo_category) t1 where rn <= 10;