准备
LKh7zAJ4nwo TheReceptionist 653 Entertainment 424 13021 4.34 1305 744 DjdA-5oKYFQ NxTDlnOuybo c-8VuICzXtU DH56yrIO5nI W1Uo5DQTtzc E-3zXq_r4w0 1TCeoRPg5dE yAr26YhuYNY 2ZgXx72XmoE -7ClGo-YgZ0 vmdPOOd6cxI KRHfMQqSHpk pIMpORZthYw 1tUDzOp10pk heqocRij5P0 _XIuvoH6rUg LGVU5DsezE0 uO2kj6_D8B4 xiDqywcDQRM uX81lMev6_o
这是一行我们准备清洗的数据,它的每个数据的意思是(依次)
视频唯一id 视频上传者 视频年龄 视频类别 视频长度 观看次数 视频评分 流量 评论数 相关视频id
要注意的是:
-
视频类别:可能有多个分类,中间要以&分割,但是在有的数据中会以如下形式显示
People & Blogs —— 其中 & 的两边带有空格,我们要把这些空格去掉
-
相关视频id是以tab("\t")分割的,我们要将它换为&
-
有的电影没有相关电影,我们要将这些数据过滤掉
了解需求后,我们开始做!
环境
IDEA + Maven +hadoop
相关依赖
<!-- Build dependencies for the ETL job: JUnit, Log4j 2 core, and three Hadoop artifacts. -->
<dependencies>
<!-- JUnit. NOTE(review): "RELEASE" is not a fixed version number; consider pinning an explicit version so builds are reproducible. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>RELEASE</version>
</dependency>
<!-- Log4j 2 core. NOTE(review): 2.8.2 appears to predate the Log4Shell (CVE-2021-44228) fixes; confirm and upgrade before using outside a demo. -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.8.2</version>
</dependency>
<!-- Hadoop artifacts below are all declared at the same version, 2.7.2. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.2</version>
</dependency>
</dependencies>
代码
Mapper代码
/**
 * Mapper of the ETL job: passes every input line through
 * {@code ETLUtil.FormatString} and emits the cleaned line as the output key
 * with a {@link NullWritable} value.
 */
public class ETLMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    // Single output-key instance; its content is overwritten for each emitted record.
    Text k = new Text();

    /**
     * Cleans one input line and writes it out.
     *
     * @param key     input key supplied by the framework (not used here)
     * @param value   one line of raw input
     * @param context used to emit the cleaned record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String cleaned = ETLUtil.FormatString(value.toString());

        // A null result means the line was rejected by the cleaner; emit nothing for it.
        if (cleaned != null) {
            k.set(cleaned);
            context.write(k, NullWritable.get());
        }
    }
}
ETL工具类:
package util;
/**
 * Line-level cleaning helpers for the video ETL job.
 */
public class ETLUtil {
    /** Delimiter between fields of a raw record and between the leading fields of a cleaned record. */
    private static final String FIELD_SEPARATOR = "\t";
    /** Delimiter placed between related-video ids in a cleaned record. */
    private static final String RELATED_ID_SEPARATOR = "&";
    /** Number of leading fields that precede the related-video ids. */
    private static final int FIXED_FIELD_COUNT = 9;
    /** Zero-based position of the category field. */
    private static final int CATEGORY_INDEX = 3;

    /**
     * Cleans one tab-delimited record.
     *
     * <ul>
     *   <li>Records with fewer than 9 fields are rejected.</li>
     *   <li>All spaces are removed from the category field (index 3), so
     *       {@code "People & Blogs"} becomes {@code "People&Blogs"}.</li>
     *   <li>The first 9 fields stay tab-separated; every field after them is
     *       joined with {@code '&'}. A tab separates the 9th field from the
     *       first related id.</li>
     * </ul>
     *
     * A record with exactly 9 fields (no related ids) is kept.
     *
     * @param s one raw input line; may be {@code null}
     * @return the cleaned line, or {@code null} if {@code s} is {@code null}
     *         or has fewer than 9 fields
     */
    public static String FormatString(String s) {
        if (s == null) {
            return null;
        }

        // Split on an explicit tab escape: splitting on a space would break the
        // category "People & Blogs" apart and make the space removal below a no-op.
        String[] fields = s.split(FIELD_SEPARATOR);
        if (fields.length < FIXED_FIELD_COUNT) {
            return null;
        }

        fields[CATEGORY_INDEX] = fields[CATEGORY_INDEX].replace(" ", "");

        StringBuilder cleaned = new StringBuilder(s.length());
        for (int i = 0; i < fields.length; i++) {
            if (i > 0) {
                // Fields 1..9 are preceded by a tab (field 9 is the first related id);
                // every later related id is preceded by '&'.
                cleaned.append(i <= FIXED_FIELD_COUNT ? FIELD_SEPARATOR : RELATED_ID_SEPARATOR);
            }
            cleaned.append(fields[i]);
        }
        return cleaned.toString();
    }
}
驱动类 :
package etl;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class ETLDriver implements Tool {
private Configuration configuration;
public int run(String[] strings) throws Exception {
//创建Job
Job job = Job.getInstance(configuration);
//设置运行环境
job.setJarByClass(ETLDriver.class);
//设置对应的MapperReduce类
job.setMapperClass(ETLMapper.class);
//设置Mapper输出的
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
//设置全局的输出
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
//设置输出输入路径
FileInputFormat.setInputPaths(job,new Path(strings[0]));
FileOutputFormat.setOutputPath(job,new Path(strings[1]));
//不需要reduce
job.setNumReduceTasks(0);
//提交
job.submit();
return 1;
}
public void setConf(Configuration configuration) {
this.configuration=configuration;
}
public Configuration getConf() {
return configuration;
}
//主函数
public static void main(String[] args) throws Exception{
ToolRunner.run(new ETLDriver(),args);
}
}
测试运行
我们在windows上测试运行了代码,按照要求完成了相应的任务
取一条数据看看
Gnbls__5gdo ggagnisevidal 699 People&Blogs 132 15 0 0 0 FDz8KaArjOA&O1F8tm0kY44&zq_NPp6-zUY&EvtlRc_G9DA&gL5aFyBlucE&1pGjSJD35AU&QGkOy0_uoOM&NbjQ-lTYgvo&_62f9_ylrjg&SX1FY9pxrhw&ITeraiadbJA&ZZZADbubu0Y&4JhAswOQV1Y&mLeOiDF99Yo&BrdO9GagGoM&gij1PytzQNg&wkvCDCOGzGc&5pdG8PZjVog&l8k-5CA2PKY&_iCmluYaOyI
很nice!
打包,放到集群上使用
双击
jar就在这