题目:
第三阶段模块一:
有三个文件file1、file2、file3,文件中每一行都是一个数字,如下所示。
file1:
2
32
654
32
15
756
65223
file2:
5956
11
650
92
file3:
26
54
6
请编写 MapReduce 程序实现如下需求:
MapReduce 程序读取这三个文件,对三个文件中的数字进行整体升序排序,并输出到一个结果文件中,结果文件中的每一行有两个数字(两个数字之间使用制表符分隔),第一个数字代表排名,第二个数字代表原始数据
期望输出:
1 2
2 6
3 11
4 15
5 26
6 32
7 32
8 54
9 92
10 650
11 654
12 756
13 5956
14 65223
解题思路:
题目要求从小到大排序,并且还需要输出序号,所以需要新建一个Bean对象实现WritableComparable接口,封装id和number并实现自定义排序规则。最后要求只输出一个结果文件,所以需要在driver端指定reducerTask的数量为1。
程序演示:
新建maven工程并引入相关依赖。
pom文件信息(根据hadoop实际版本号引入对应版本号jar包):
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.6.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.0</version>
</dependency>
<!-- 单元测试 -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.8.2</version>
</dependency>
代码块:
orderBean:
package com.lagou.order;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* 封装Bean对象,用来保存id和数据
*/
public class OrderBean implements WritableComparable<OrderBean> {
private Integer id;
private Integer number;
public OrderBean() {
}
public OrderBean(Integer id, Integer number) {
this.id = id;
this.number = number;
}
public Integer getId() {
return id;
}
public void setId(Integer id) {
this.id = id;
}
public Integer getNumber() {
return number;
}
public void setNumber(Integer number) {
this.number = number;
}
// 指定从小到大排序
public int compareTo(OrderBean o) {
if (this.number > o.number){
return 1;
}else {
return -1;
}
}
public void write(DataOutput out) throws IOException {
out.writeInt(id);
out.writeInt(number);
}
public void readFields(DataInput in) throws IOException {
this.id = in.readInt();
this.number = in.readInt();
}
// 指定输出格式为id number
@Override
public String toString() {
return id + "\t" + number;
}
}
Mapper端:
package com.lagou.order;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper: parses each input line as an integer and emits it wrapped in an
 * {@link OrderBean} key (value is NullWritable) so the shuffle sorts globally.
 */
public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // FIX: the original called Integer.parseInt on the raw line, so a
        // blank line (e.g. a trailing newline in file1/file2/file3) crashed
        // the job with NumberFormatException. Trim and skip empty lines.
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return;
        }
        OrderBean bean = new OrderBean();
        // id defaults to 1 so serialization never hits a null field; the
        // reducer overwrites it with the real rank.
        bean.setId(1);
        bean.setNumber(Integer.parseInt(line));
        context.write(bean, NullWritable.get());
    }
}
Reducer端
package com.lagou.order;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer: receives keys already sorted ascending by the shuffle and assigns
 * each number its 1-based rank. With a single reduce task (set in the driver),
 * the counter spans the whole dataset, producing a global ranking.
 */
public class OrderReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
    // Running rank, shared across all reduce() calls of this task.
    private int rank = 1;

    @Override
    protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Duplicate numbers arrive grouped; emit one ranked line per occurrence.
        for (NullWritable ignored : values) {
            key.setId(rank);
            rank++;
            context.write(key, NullWritable.get());
        }
    }
}
Driver端
package com.lagou.order;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Driver: wires up the global-sort-and-rank job.
 *
 * <p>Usage: {@code OrderDriver <inputDir> <outputDir>}. A single reduce task
 * is forced so that all numbers land in one sorted, ranked result file.
 */
public class OrderDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration(), "OrderDriver");

        // Locate the jar containing this driver class.
        job.setJarByClass(OrderDriver.class);

        // Mapper / Reducer wiring.
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReducer.class);

        // Map-output key/value types.
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Final output key/value types.
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        // One reducer => one globally ordered output file.
        job.setNumReduceTasks(1);

        // Input and output paths from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit and block until completion; exit code reflects success.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}