Sort the numbers and add sequence numbers:

Source data:        Final result:
2                   1   2
32                  2   6
654                 3   15
32                  4   22
15                  5   26
756                 6   32
65223               7   32
5956                8   54
22                  9   92
650                 10  650
92                  11  654
26                  12  756
54                  13  5956
6                   14  65223
Analysis: the problem is solved with three chained MapReduce jobs. The first job sorts the values globally: a custom partitioner sends each value to one of three reducers by numeric range (< 100, 100-999, > 999), so every reducer writes a sorted file and the files themselves are in range order. The second job counts how many records fall into each of those partition files. The third job is map-only: each map task reads one sorted partition file, loads the counts from the second job, computes the starting sequence number for its own file (1 plus the counts of all lower-numbered partitions), and prefixes every value with a running index.
Code:
First MapReduce program: partition and sort the data
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * This MR job implements the global sort; each value will also get a sequence number
 * (added later by the third job).
 */
public class IndexNumerMR {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // conf.set("fs.defaultFS", "hdfs://hadoop02:9000");
        // System.setProperty("HADOOP_USER_NAME", "hadoop");
        FileSystem fs = FileSystem.get(conf);

        Job job = Job.getInstance(conf, "IndexNumerMR");
        job.setJarByClass(IndexNumerMR.class);
        job.setMapperClass(IndexNumerMRMapper.class);
        job.setReducerClass(IndexNumerMRReducer.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Three reduce tasks, one per numeric range, selected by the custom partitioner.
        job.setNumReduceTasks(3);
        job.setPartitionerClass(MyPartitioner.class);

        // Path inputPath = new Path("/array/input/");
        Path inputPath = new Path("G:/files/mr/day2/q6/input");
        // Path outputPath = new Path("/array/output/");
        Path outputPath = new Path("G:/files/mr/day2/q6/output");
        FileInputFormat.addInputPath(job, inputPath);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    public static class IndexNumerMRMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {

        private LongWritable keyOut = new LongWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            long outKey = Long.parseLong(value.toString());
            // Read the numbers line by line and emit each one as the key;
            // the custom partitioner decides which reducer (numeric range) it goes to.
            keyOut.set(outKey);
            context.write(keyOut, NullWritable.get());
        }
    }

    public static class IndexNumerMRReducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {

        @Override
        protected void reduce(LongWritable key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            for (NullWritable nvl : values) {
                // Emit each occurrence so duplicate values (e.g. the two 32s) are preserved.
                context.write(key, nvl);
            }
        }
    }
}
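With the sample data above, this first job should produce three sorted files under G:/files/mr/day2/q6/output, one per reduce task (the contents below are worked out from the sample data, not copied from an actual run):

part-r-00000 (values < 100): 2 6 15 22 26 32 32 54 92
part-r-00001 (100-999):      650 654 756
part-r-00002 (> 999):        5956 65223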
Custom partitioner: MyPartitioner defines the partition rule
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * KEY, VALUE are the mapper's output key-value types.
 */
public class MyPartitioner extends Partitioner<LongWritable, NullWritable> {

    @Override
    public int getPartition(LongWritable key, NullWritable value, int numPartitions) {
        // Partition rule: split the values into three numeric ranges.
        if (key.get() < 100) {
            return 0;
        } else if (key.get() >= 100 && key.get() <= 999) {
            return 1;
        } else {
            return 2;
        }
    }
}
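To double-check the rule against the sample data, a small standalone class like the one below can be used (MyPartitionerCheck is a hypothetical helper, not part of the original solution; it simply calls getPartition on each sample value):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;

public class MyPartitionerCheck {
    public static void main(String[] args) {
        // Sample values from the problem statement (hard-coded here for illustration).
        long[] sample = {2, 32, 654, 32, 15, 756, 65223, 5956, 22, 650, 92, 26, 54, 6};
        MyPartitioner partitioner = new MyPartitioner();
        for (long v : sample) {
            // numPartitions = 3 matches job.setNumReduceTasks(3) in the first job.
            int ptn = partitioner.getPartition(new LongWritable(v), NullWritable.get(), 3);
            System.out.println(v + " -> partition " + ptn);
        }
    }
}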
Second MapReduce program: count how many records are in each partition
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Counts how many records end up in each partition file produced by the first job.
 */
public class IndexNumerMR_2 {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // conf.set("fs.defaultFS", "hdfs://hadoop02:9000");
        // System.setProperty("HADOOP_USER_NAME", "hadoop");
        FileSystem fs = FileSystem.get(conf);

        Job job = Job.getInstance(conf, "IndexNumerMR_2");
        job.setJarByClass(IndexNumerMR_2.class);
        job.setMapperClass(IndexNumerMR_2Mapper.class);
        job.setReducerClass(IndexNumerMR_2Reducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        Path inputPath = new Path("G:/files/mr/day2/q6/input");
        Path outputPath = new Path("G:/files/mr/day2/q6/output2");
        FileInputFormat.addInputPath(job, inputPath);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    /**
     * Output types: Text, LongWritable
     *
     * key   : the name of the partition file the value belongs to
     * value : 1 for every record in that partition
     */
    public static class IndexNumerMR_2Mapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        private Text keyOut = new Text();
        private LongWritable ONE = new LongWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String string = value.toString();
            long outValue = Long.parseLong(string);
            // Map the value to the partition file it lands in,
            // using the same ranges as MyPartitioner.
            if (outValue < 100) {
                keyOut.set("part-r-00000");
            } else if (outValue >= 100 && outValue <= 999) {
                keyOut.set("part-r-00001");
            } else {
                keyOut.set("part-r-00002");
            }
            context.write(keyOut, ONE);
        }
    }

    public static class IndexNumerMR_2Reducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        private LongWritable valueOut = new LongWritable();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable lw : values) {
                sum += lw.get();
            }
            valueOut.set(sum);
            context.write(key, valueOut);
        }
    }
}
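With the sample data, the second job's single output file (G:/files/mr/day2/q6/output2/part-r-00000) should hold the per-partition counts that the third program later loads into memory (again worked out from the sample data, not from an actual run):

part-r-00000	9
part-r-00001	3
part-r-00002	2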
Third MapReduce program: number the data in each partition
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * The first job was run with 3 reduce tasks, so this job gets 3 input files and
 * therefore 3 map tasks. Each map task loads the counts produced by the second job
 * into memory, which is effectively a map-side join, and then numbers the records
 * of its own file.
 */
public class IndexNumerMR_3 {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // conf.set("fs.defaultFS", "hdfs://hadoop02:9000");
        // System.setProperty("HADOOP_USER_NAME", "hadoop");
        FileSystem fs = FileSystem.get(conf);

        Job job = Job.getInstance(conf, "IndexNumerMR_3");
        job.setJarByClass(IndexNumerMR_3.class);
        job.setMapperClass(IndexNumerMR_3Mapper.class);
        // job.setReducerClass(IndexNumerMR_3Reducer.class);
        // job.setMapOutputKeyClass(LongWritable.class);
        // job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Map-only job: the input files are already sorted, so no reduce phase is needed.
        job.setNumReduceTasks(0);

        // DistributedCache.addCacheFile(uri, conf);
        // job.addCacheFile(new URI("hdfs://hadoop02:9000/shu/output_ptn_count/part-r-00000"));
        // For local testing:
        job.addCacheFile(new URI("file:/G:/files/mr/day2/q6/output2/part-r-00000"));

        Path inputPath = new Path("G:/files/mr/day2/q6/output");
        Path outputPath = new Path("G:/files/mr/day2/q6/output3");
        FileInputFormat.addInputPath(job, inputPath);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    public static class IndexNumerMR_3Mapper extends Mapper<LongWritable, Text, LongWritable, Text> {

        private LongWritable keyOut = new LongWritable();

        /**
         * The starting sequence number for the file handled by this map task.
         */
        private long indexStart = 1L;

        private Map<String, Long> ptnCountMap = new HashMap<>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            /**
             * Load the per-partition counts produced by the second job, e.g.
             *   part-r-00000    9
             *   part-r-00001    3
             *   part-r-00002    2
             * into ptnCountMap.
             */
            // When running on the cluster:
            // Path[] localCacheFiles = context.getLocalCacheFiles();
            // Path filePath = localCacheFiles[0];
            // BufferedReader br = new BufferedReader(new FileReader(new File(filePath.toUri().toString())));

            // For local debugging:
            BufferedReader br = new BufferedReader(new FileReader("G:/files/mr/day2/q6/output2/part-r-00000"));
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] split = line.split("\t");
                ptnCountMap.put(split[0], Long.parseLong(split[1]));
            }
            br.close();

            /**
             * Work out the starting sequence number for this map task
             * from the name of the input file it is processing.
             */
            InputSplit inputSplit = context.getInputSplit();
            FileSplit fileSplit = (FileSplit) inputSplit;
            // name === "part-r-00000"
            String name = fileSplit.getPath().getName();
            String reduceNo = name.split("-")[2];
            int reduceNumer = Integer.parseInt(reduceNo);
            // If this file's number is 2, indexStart must be 1 plus the record counts
            // of the files numbered 0 and 1.
            for (int i = 0; i < reduceNumer; i++) {
                String strReduceName = "part-r-" + getReduceTaskResultName(i);
                indexStart += ptnCountMap.get(strReduceName);
            }
        }

        /**
         * Left-pads the reduce task number to the five-digit suffix used in output file names.
         */
        private String getReduceTaskResultName(int i) {
            if (i < 10) {
                return "0000" + i;
            } else if (i < 100) {
                return "000" + i;
            } else if (i < 1000) {
                return "00" + i;
            } else if (i < 10000) {
                return "0" + i;
            } else {
                return "" + i;
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // The input file is already sorted, so just prefix each record
            // with the next sequence number.
            keyOut.set(indexStart);
            context.write(keyOut, value);
            indexStart++;
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // IOUtils.closeStream(br);
        }
    }
}
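Running the three programs in order should reproduce the result table at the top: the map task that reads part-r-00000 (values < 100) starts numbering at 1 and emits indices 1 through 9, the one that reads part-r-00001 starts at 1 + 9 = 10 and emits 10 through 12, and the one that reads part-r-00002 starts at 1 + 9 + 3 = 13 and emits 13 and 14. Merged, the files under output3 contain the pairs from 1 2 up to 14 65223 shown under "Final result" above (figures derived from the sample data, not from an actual run).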