- 数据准备
user.txt
0.txt
字段以及字段名解析
user表
字段 备注 字段类型
uploader 上传者用户名 string
videos 上传视频数 int
friends 朋友数量 int
视频表:
字段 备注 详细描述
video id 视频唯一 id 11 位字符串
uploader 视频上传者 上传视频的用户名 String
age 视频年龄 视频上传日期和 2007 年 2 月
15 日之间的整数天(Youtube的独特设定)
category 视频类别 上传视频指定的视频分类
length 视频长度 整形数字标识的视频长度
views 观看次数 视频被浏览的次数
rate 视频评分 满分 5 分
ratings 流量 视频的流量,整型数字
comments 评论数 一个视频的整数评论数
related ids 相关视频 id 相关视频的 id,最多 20 个
- 数据清洗
- 通过mapreduce将数据清洗出来,通过观察原始数据形式,可以发现,视频可以有多个所属分类,每个所属分类用&符号分割,且分割的两边有空格字符,同时相关视频也是可以有多个元素,多个相关视频又用“\t”进行分割。为了分析数据时方便对存在多个子元素的数据进行操作,我们首先进行数据重组清洗操作。即:将所有的类别用“&”分割,同时去掉两边空格,多个相关视频 id 也使用“&”进行分割。将数据放到hdfs指定的文件夹里面。
ETL数据清洗
ETLUtils.java
package ETLUtils;
/**
 * String-level ETL helpers for the raw YouTube video dump.
 */
public class ETLUtils {

    /**
     * Cleans one raw tab-separated video record.
     *
     * Input layout: 9 core fields (videoId .. comments) followed by a
     * variable number of related-video ids, all tab-separated.
     * Output: the 9 core fields tab-separated, spaces removed from the
     * category field, and the related ids joined with "&".
     *
     * @param str one raw line of the dump
     * @return the cleaned line, or {@code null} for malformed records
     *         with fewer than 9 fields
     */
    public static String getETCString(String str) {
        String[] fields = str.split("\t");
        // Validate BEFORE touching fields[3]: the original order accessed
        // fields[3] first and threw ArrayIndexOutOfBoundsException on
        // short lines instead of filtering them out.
        if (fields.length < 9) {
            return null;
        }
        // Strip spaces inside the category field, e.g. "A & B" -> "A&B".
        fields[3] = fields[3].replaceAll(" ", "");
        StringBuilder cleaned = new StringBuilder();
        for (int i = 0; i < fields.length; i++) {
            cleaned.append(fields[i]);
            if (i < 9) {
                // Tab after each of the 9 core fields (indices 0..8).
                cleaned.append("\t");
            } else if (i != fields.length - 1) {
                // Related ids (index 9 onward) are joined with '&'.
                cleaned.append("&");
            }
        }
        return cleaned.toString();
    }
}
ETLMapper
package mapper;
import ETLUtils.ETLUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Map-only cleaning step: feeds each raw line through the ETL utility
 * and emits the cleaned record (no key) — invalid records are dropped.
 */
public class ETLMapper extends Mapper<Object,Text,NullWritable,Text> {
    // Reused across calls to avoid allocating a Text per record.
    Text outValue = new Text();

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        final String cleaned = ETLUtils.getETCString(value.toString());
        if (StringUtils.isBlank(cleaned)) {
            // getETCString returns null for malformed records; skip them.
            return;
        }
        outValue.set(cleaned);
        context.write(NullWritable.get(), outValue);
    }
}
ETLRunner
package runner;
import java.io.IOException;
import mapper.ETLMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Driver for the map-only YouTube ETL job.
 * Usage: ETLRunner &lt;hdfs input dir&gt; &lt;hdfs output dir&gt;
 */
public class ETLRunner implements Tool {
    private Configuration conf = null;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public int run(String[] args) throws Exception {
        conf = this.getConf();
        // Stash the CLI paths on the configuration so the
        // init*Path helpers can read them back from the job.
        conf.set("inpath", args[0]);
        conf.set("outpath", args[1]);

        Job job = Job.getInstance(conf, "youtub_etl_video");
        job.setJarByClass(ETLRunner.class);
        job.setMapperClass(ETLMapper.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // Map-only job: cleaning needs no shuffle or reduce phase.
        job.setNumReduceTasks(0);

        this.initInputPath(job);
        this.initOutputPath(job);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Registers the input path, failing fast if it does not exist on HDFS. */
    private void initInputPath(Job job) throws IOException {
        Configuration jobConf = job.getConfiguration();
        String inpaths = jobConf.get("inpath");
        FileSystem fs = FileSystem.get(jobConf);
        Path inputDir = new Path(inpaths);
        if (!fs.exists(inputDir)) {
            throw new RuntimeException("HDFS目录不存在" + inpaths);
        }
        FileInputFormat.addInputPath(job, inputDir);
    }

    /** Registers the output path, deleting any previous run's output first. */
    private void initOutputPath(Job job) throws IOException {
        Configuration jobConf = job.getConfiguration();
        Path outputDir = new Path(jobConf.get("outpath"));
        FileSystem fs = FileSystem.get(jobConf);
        if (fs.exists(outputDir)) {
            // MapReduce refuses to start if the output dir exists.
            fs.delete(outputDir, true);
        }
        FileOutputFormat.setOutputPath(job, outputDir);
    }

    public static void main(String[] args) {
        try {
            int result = ToolRunner.run(new ETLRunner(), args);
            if (result == 0) {
                System.out.println("Success!");
            } else {
                System.out.println("Fail!");
            }
            System.exit(result);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}
- 建立四张表打算用orc的表进行操作(操作效率比较高),但是orc类型的表只能通过insert的形式插入数据,所以需要建立两张ori的表再进行插入操作。
-- Raw (textfile) video table, loaded directly from the ETL output.
create table youtube_ori(
    videoId string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
-- Raw (textfile) user table. The leading prose "创建user ori表" was not
-- valid SQL and would break the script; it is now a comment.
create table youtube_user_ori(
    uploader string,
    videos int,
    friends int)
clustered by (uploader) into 24 buckets
row format delimited fields terminated by "\t"
stored as textfile;
-- ORC-backed video table (loaded via insert ... select from youtube_ori).
-- NOTE: the original used "//" as a comment delimiter, which Hive does not
-- support ("--" is the line-comment syntax).
create table youtube_orc(
    videoId string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate float,
    ratings int,
    comments int,
    relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;
-- ORC-backed user table, bucketed the same way as youtube_user_ori.
create table youtube_user_orc(
    uploader string,
    videos int,
    friends int)
clustered by (uploader) into 24 buckets
row format delimited fields terminated by "\t"
stored as orc;
- 导入数据到表ori中
-- Load the MapReduce-cleaned video records (output of the ETL job above).
load data inpath '/output/part-m-00000' into table youtube_ori;
-- Load the raw user file.
load data inpath '/output/user.txt' into table youtube_user_ori;
- 将表导入到orc中
用 insert into table A select * from B 导入到 orc 表
- 数据分析
字段:
视频id 上传者 视频年龄 视频类别 视频长度 观看次数 视频评分 流量 评论数 相关视频
videoId,uploader,age,category,length,views,rate,ratings,comments,relatedId
- 统计视频观看数top10
-- Top 10 videos by view count.
create table viewstop10 as
select
    videoId, uploader, age, category, length,
    views, rate, ratings, comments, relatedId
from youtube_orc
order by views desc
limit 10;
- 统计类别热度top10
-- Top 10 categories by number of videos ("heat").
create table hotTop10 as
select
    t1.category_name as category,
    count(t1.videoId) as hot
from (
    -- one row per (video, category) pair
    select videoId, category_name
    from youtube_orc
    lateral view explode(category) t_category as category_name
) t1
group by t1.category_name
order by hot desc
limit 10;
- 统计出视频观看数最高的 20 个视频的所属类别以及类别包含这 Top20 视频的个数
-- For the 20 most-viewed videos: their categories, and how many of the
-- top-20 fall into each category.
create table top20views_countCategory as
select
    category_name,
    count(videoId) as vcount
from (
    select videoId, category
    from (
        select * from youtube_orc order by views desc limit 20
    ) t1
) t2
lateral view explode(category) t_category as category_name
group by category_name
order by vcount desc;
- 统计视频观看数 Top50 所关联视频的所属类别的热度排名
-- Category heat ranking of the videos RELATED to the top-50 most viewed.
-- The original query never used relatedId at all — it only exploded the
-- top-50 videos' own categories, which answers a different question.
-- Fix: explode relatedId, join each related id back to youtube_orc to get
-- its categories, then count per category.
select category_name, count(*) as hot
from (
    -- categories of every video referenced by a top-50 video's relatedId
    select v.category
    from (
        -- flatten the related-id lists of the 50 most-viewed videos
        select explode(relatedId) as related_id
        from (
            select relatedId from youtube_orc order by views desc limit 50
        ) t1
    ) t2
    join youtube_orc v on t2.related_id = v.videoId
) t3
lateral view explode(t3.category) t_category as category_name
group by category_name
order by hot desc;
- 统计每个类别中的视频热度 Top10,以 Music 为例
-- Flattened video table: one row per (video, category) pair, produced by
-- exploding youtube_orc.category. category array<string> is replaced by a
-- scalar categoryId so per-category rankings can filter/partition on it.
create table youtube_category(
videoId string,
uploader string,
age int,
categoryId string,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as orc;
-- Populate the flattened table: explode each video's category array into
-- one row per category value. The original statement was missing its
-- terminating ";" and would have collided with the next CREATE TABLE.
insert into table youtube_category
select
    videoId,
    uploader,
    age,
    categoryId,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedId
from youtube_orc
lateral view explode(category) t_category as categoryId;
-- Top 10 "Music" videos by view count.
create table musicTop10 as
select
    videoId,
    categoryId,
    views
from youtube_category
where categoryId = "Music"
order by views desc
limit 10;
- 统计每个类别中视频流量 Top10,以 Music 为例
-- Top 10 "Music" videos by traffic (ratings).
create table ratingsTop10 as
select
    videoId,
    views,
    ratings
from youtube_category
where categoryId = "Music"
order by ratings desc
limit 10;
- 统计上传视频最多的用户 Top10 以及他们上传的观看次数在前 20 的视频
-- For the 10 users with the most uploads: their 20 most-viewed videos.
-- The original "order by t2.views" was ascending, which returned the 20
-- LEAST-viewed videos; the requirement ("观看次数在前 20") needs desc.
select
    t2.videoId,
    t2.uploader,
    t2.views,
    t1.videos
from (
    select * from youtube_user_orc order by videos desc limit 10
) t1
join youtube_orc t2 on t1.uploader = t2.uploader
order by t2.views desc
limit 20;
- 统计每个类别视频观看数 Top10
-- Per-category view-count Top 10, via a ranking window per categoryId.
-- NOTE: the rank column keeps its original name "rant" because it becomes
-- a column of the created table.
create table categoryId_views as
select *
from (
    select
        videoId,
        categoryId,
        views,
        row_number() over(partition by categoryId order by views desc) rant
    from youtube_category
) t1
where rant <= 10;