大数据学习作业-hadoop
题目
有三个文件,文件中每一行都是一个数字。请编写程序,对三个文件中的数字整体进行降序排列。结果文件中每一行有两个数字:第一个数字代表排名,第二个数字代表原始数据。
期望结果如下(注:下面的示例按升序给出,且并列值 32 获得了不同名次,与题目"降序排列"及下文代码的并列同名次处理不一致;以实际运行结果一节为准)
1 2
2 6
3 15
4 22
5 26
6 32
7 32
8 54
9 92
解题思路
1 由于要对数据进行全排序,所以应该所有的数据最终都由一个reduce task进行处理
2 使用降序排列自定义类并且重写比较器进行排序,map端的输入<LongWritable,Text>输出<自定义类,NullWritable>
3 数据排序之外,还要显示数据的序号。可以利用 reduce 端仅有一个 task、逐行处理数据的特点,定义计数变量并递增。reduce 端输出 <LongWritable, 自定义类>
自定义类
package com.lagou.hadoop.sort;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * MapReduce key wrapping one number from the input files.
 *
 * <p>Implements {@link WritableComparable} so the shuffle phase sorts records
 * by this key. The comparison is reversed (descending) so the largest value
 * arrives at the reducer first.
 */
public class SortBean implements WritableComparable<SortBean> {
    // The raw number read from one input line.
    private Long value;

    public Long getValue() {
        return value;
    }

    public void setValue(Long value) {
        this.value = value;
    }

    /**
     * Descending order: larger values sort first.
     *
     * @param o the other key to compare against
     * @return a negative number if {@code this} should sort before {@code o}
     */
    @Override
    public int compareTo(SortBean o) {
        // Long.compare with swapped arguments reverses the natural
        // (ascending) order; avoids hand-rolled ternary comparison logic.
        return Long.compare(o.value, this.value);
    }

    /** Serializes the key for the shuffle phase. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(value);
    }

    /** Deserializes the key; must mirror {@link #write(DataOutput)}. */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.value = dataInput.readLong();
    }

    @Override
    public String toString() {
        return Long.toString(this.value);
    }
}
mapper
package com.lagou.hadoop.sort;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Parses one number per input line and emits it as the map-output key so the
 * shuffle phase performs the sort. The value is {@link NullWritable} because
 * only the key carries data.
 */
public class SortMapper extends Mapper<LongWritable, Text, SortBean, NullWritable> {
    // Reused across map() calls to avoid per-record allocation; safe because
    // context.write serializes the key immediately.
    private final SortBean bean = new SortBean();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        // Skip blank lines (e.g. a trailing newline at end of file) instead of
        // failing the whole task with a NumberFormatException.
        if (line.isEmpty()) {
            return;
        }
        bean.setValue(Long.valueOf(line));
        context.write(bean, NullWritable.get());
    }
}
reducer
package com.lagou.hadoop.sort;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Objects;
/**
 * Receives keys already sorted descending by the shuffle and writes
 * {@code rank value} lines. Requires exactly one reduce task, otherwise each
 * task produces its own independent ranking.
 *
 * <p>Equal values share a rank and the counter advances by one per distinct
 * value (dense ranking), per the original author's comment.
 * NOTE(review): the expected sample output numbers ties distinctly
 * (6 32 / 7 32) — confirm which ranking scheme the assignment requires.
 */
public class SortReducer extends Reducer<SortBean, NullWritable, LongWritable, SortBean> {
    // Instance fields, NOT static: static mutable state would leak between
    // reducer instances when the framework reuses a JVM for multiple tasks,
    // producing ranks that do not restart at 1.
    private final LongWritable rank = new LongWritable(0);
    // Value of the previously emitted key; null until the first record.
    private Long previousValue = null;

    @Override
    protected void reduce(SortBean key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        for (NullWritable ignored : values) {
            // Advance the rank only when the value changes so ties share it.
            if (!Objects.equals(key.getValue(), previousValue)) {
                rank.set(rank.get() + 1);
                previousValue = key.getValue();
            }
            context.write(rank, key);
        }
    }
}
driver
package com.lagou.hadoop.sort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Job driver: wires mapper, reducer, key/value types and I/O paths for the
 * total-order descending sort.
 */
public class SortDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        // "job", not "wordCount" — this is the sort job, not a word count.
        Job job = Job.getInstance(configuration, "sort_driver");
        job.setJarByClass(SortDriver.class);
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReducer.class);
        // Map output: the number itself is the key so the shuffle sorts it.
        job.setMapOutputKeyClass(SortBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Final output: rank -> original value.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(SortBean.class);
        // A single reducer is REQUIRED for a correct global ranking; 1 is the
        // default, but stating it guards against config overrides.
        job.setNumReduceTasks(1);
        FileInputFormat.setInputPaths(job, new Path("C:/lagou/test/homework/input"));
        FileOutputFormat.setOutputPath(job, new Path("C:/lagou/test/homework/output"));
        // Submit and block until the job finishes; exit code reflects success.
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
运行结果
1 65223
2 5956
3 756
4 654
5 650
6 92
7 54
8 32
8 32
9 26
10 22
11 15
12 6
13 2