Sort the numbers and add sequence numbers:

Source data:        Final result:
2                   1   2
32                  2   6
654                 3   15
32                  4   22
15                  5   26
756                 6   32
65223               7   32
5956                8   54
22                  9   92
650                 10  650
92                  11  654
26                  12  756
54                  13  5956
6                   14  65223
Analysis: the problem is solved with three chained MapReduce jobs. The first job sorts the values globally: a custom partitioner sends each value to one of three reducers by numeric range (< 100, 100-999, > 999), so every reducer writes a sorted file and the files themselves are in range order. The second job counts how many records fall into each of those partition files. The third job is map-only: each map task reads one sorted partition file, loads the counts from the second job, computes the starting sequence number for its own file (1 plus the counts of all lower-numbered partitions), and prefixes every value with a running index.
Code:
First MapReduce program: partition and sort the data
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * This MR job implements the global sort; each value will also get a sequence number
 * (added later by the third job).
 */
public class IndexNumerMR {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // conf.set("fs.defaultFS", "hdfs://hadoop02:9000");
        // System.setProperty("HADOOP_USER_NAME", "hadoop");
        FileSystem fs = FileSystem.get(conf);

        Job job = Job.getInstance(conf, "IndexNumerMR");
        job.setJarByClass(IndexNumerMR.class);
        job.setMapperClass(IndexNumerMRMapper.class);
        job.setReducerClass(IndexNumerMRReducer.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Three reduce tasks, one per numeric range, selected by the custom partitioner.
        job.setNumReduceTasks(3);
        job.setPartitionerClass(MyPartitioner.class);

        // Path inputPath = new Path("/array/input/");
        Path inputPath = new Path("G:/files/mr/day2/q6/input");
        // Path outputPath = new Path("/array/output/");
        Path outputPath = new Path("G:/files/mr/day2/q6/output");
        FileInputFormat.addInputPath(job, inputPath);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    public static class IndexNumerMRMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {

        private LongWritable keyOut = new LongWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            long outKey = Long.parseLong(value.toString());
            // Read the numbers line by line and emit each one as the key;
            // the custom partitioner decides which reducer (numeric range) it goes to.
            keyOut.set(outKey);
            context.write(keyOut, NullWritable.get());
        }
    }

    public static class IndexNumerMRReducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {

        @Override
        protected void reduce(LongWritable key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            for (NullWritable nvl : values) {
                // Emit each occurrence so duplicate values (e.g. the two 32s) are preserved.
                context.write(key, nvl);
            }
        }
    }
}
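With the sample data above, this first job should produce three sorted files under G:/files/mr/day2/q6/output, one per reduce task (the contents below are worked out from the sample data, not copied from an actual run):

part-r-00000 (values < 100): 2 6 15 22 26 32 32 54 92
part-r-00001 (100-999):      650 654 756
part-r-00002 (> 999):        5956 65223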
Custom partitioner: MyPartitioner defines the partition rule
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * KEY, VALUE are the mapper's output key-value types.
 */
public class MyPartitioner extends Partitioner<LongWritable, NullWritable> {

    @Override
    public int getPartition(LongWritable key, NullWritable value, int numPartitions) {
        // Partition rule: split the values into three numeric ranges.
        if (key.get() < 100) {
            return 0;
        } else if (key.get() >= 100 && key.get() <= 999) {
            return 1;
        } else {
            return 2;
        }
    }
}
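To double-check the rule against the sample data, a small standalone class like the one below can be used (MyPartitionerCheck is a hypothetical helper, not part of the original solution; it simply calls getPartition on each sample value):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;

public class MyPartitionerCheck {
    public static void main(String[] args) {
        // Sample values from the problem statement (hard-coded here for illustration).
        long[] sample = {2, 32, 654, 32, 15, 756, 65223, 5956, 22, 650, 92, 26, 54, 6};
        MyPartitioner partitioner = new MyPartitioner();
        for (long v : sample) {
            // numPartitions = 3 matches job.setNumReduceTasks(3) in the first job.
            int ptn = partitioner.getPartition(new LongWritable(v), NullWritable.get(), 3);
            System.out.println(v + " -> partition " + ptn);
        }
    }
}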
Second MapReduce program: count how many records are in each partition
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Counts how many records end up in each partition file produced by the first job.
 */
public class IndexNumerMR_2 {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // conf.set("fs.defaultFS", "hdfs://hadoop02:9000");
        // System.setProperty("HADOOP_USER_NAME", "hadoop");
        FileSystem fs = FileSystem.get(conf);

        Job job = Job.getInstance(conf, "IndexNumerMR_2");
        job.setJarByClass(IndexNumerMR_2.class);
        job.setMapperClass(IndexNumerMR_2Mapper.class);
        job.setReducerClass(IndexNumerMR_2Reducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        Path inputPath = new Path("G:/files/mr/day2/q6/input");
        Path outputPath = new Path("G:/files/mr/day2/q6/output2");
        FileInputFormat.addInputPath(job, inputPath);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    /**
     * Output types: Text, LongWritable
     *
     * key   : the name of the partition file the value belongs to
     * value : 1 for every record in that partition
     */
    public static class IndexNumerMR_2Mapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        private Text keyOut = new Text();
        private LongWritable ONE = new LongWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String string = value.toString();
            long outValue = Long.parseLong(string);
            // Map the value to the partition file it lands in,
            // using the same ranges as MyPartitioner.
            if (outValue < 100) {
                keyOut.set("part-r-00000");
            } else if (outValue >= 100 && outValue <= 999) {
                keyOut.set("part-r-00001");
            } else {
                keyOut.set("part-r-00002");
            }
            context.write(keyOut, ONE);
        }
    }

    public static class IndexNumerMR_2Reducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        private LongWritable valueOut = new LongWritable();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable lw : values) {
                sum += lw.get();
            }
            valueOut.set(sum);
            context.write(key, valueOut);
        }
    }
}
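With the sample data, the second job's single output file (G:/files/mr/day2/q6/output2/part-r-00000) should hold the per-partition counts that the third program later loads into memory (again worked out from the sample data, not from an actual run):

part-r-00000	9
part-r-00001	3
part-r-00002	2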
Third MapReduce program: number the data in each partition
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * The first job was run with 3 reduce tasks, so this job gets 3 input files and
 * therefore 3 map tasks. Each map task loads the counts produced by the second job
 * into memory, which is effectively a map-side join, and then numbers the records
 * of its own file.
 */
public class IndexNumerMR_3 {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // conf.set("fs.defaultFS", "hdfs://hadoop02:9000");
        // System.setProperty("HADOOP_USER_NAME", "hadoop");
        FileSystem fs = FileSystem.get(conf);

        Job job = Job.getInstance(conf, "IndexNumerMR_3");
        job.setJarByClass(IndexNumerMR_3.class);
        job.setMapperClass(IndexNumerMR_3Mapper.class);
        // job.setReducerClass(IndexNumerMR_3Reducer.class);
        // job.setMapOutputKeyClass(LongWritable.class);
        // job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Map-only job: the input files are already sorted, so no reduce phase is needed.
        job.setNumReduceTasks(0);

        // DistributedCache.addCacheFile(uri, conf);
        // job.addCacheFile(new URI("hdfs://hadoop02:9000/shu/output_ptn_count/part-r-00000"));
        // For local testing:
        job.addCacheFile(new URI("file:/G:/files/mr/day2/q6/output2/part-r-00000"));

        Path inputPath = new Path("G:/files/mr/day2/q6/output");
        Path outputPath = new Path("G:/files/mr/day2/q6/output3");
        FileInputFormat.addInputPath(job, inputPath);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean isDone = job.waitForCompletion(true);
        System.exit(isDone ? 0 : 1);
    }

    public static class IndexNumerMR_3Mapper extends Mapper<LongWritable, Text, LongWritable, Text> {

        private LongWritable keyOut = new LongWritable();

        /**
         * The starting sequence number for the file handled by this map task.
         */
        private long indexStart = 1L;

        private Map<String, Long> ptnCountMap = new HashMap<>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            /**
             * Load the per-partition counts produced by the second job, e.g.
             *   part-r-00000    9
             *   part-r-00001    3
             *   part-r-00002    2
             * into ptnCountMap.
             */
            // When running on the cluster:
            // Path[] localCacheFiles = context.getLocalCacheFiles();
            // Path filePath = localCacheFiles[0];
            // BufferedReader br = new BufferedReader(new FileReader(new File(filePath.toUri().toString())));

            // For local debugging:
            BufferedReader br = new BufferedReader(new FileReader("G:/files/mr/day2/q6/output2/part-r-00000"));
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] split = line.split("\t");
                ptnCountMap.put(split[0], Long.parseLong(split[1]));
            }
            br.close();

            /**
             * Work out the starting sequence number for this map task
             * from the name of the input file it is processing.
             */
            InputSplit inputSplit = context.getInputSplit();
            FileSplit fileSplit = (FileSplit) inputSplit;
            // name === "part-r-00000"
            String name = fileSplit.getPath().getName();
            String reduceNo = name.split("-")[2];
            int reduceNumer = Integer.parseInt(reduceNo);
            // If this file's number is 2, indexStart must be 1 plus the record counts
            // of the files numbered 0 and 1.
            for (int i = 0; i < reduceNumer; i++) {
                String strReduceName = "part-r-" + getReduceTaskResultName(i);
                indexStart += ptnCountMap.get(strReduceName);
            }
        }

        /**
         * Left-pads the reduce task number to the five-digit suffix used in output file names.
         */
        private String getReduceTaskResultName(int i) {
            if (i < 10) {
                return "0000" + i;
            } else if (i < 100) {
                return "000" + i;
            } else if (i < 1000) {
                return "00" + i;
            } else if (i < 10000) {
                return "0" + i;
            } else {
                return "" + i;
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // The input file is already sorted, so just prefix each record
            // with the next sequence number.
            keyOut.set(indexStart);
            context.write(keyOut, value);
            indexStart++;
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // IOUtils.closeStream(br);
        }
    }
}
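Running the three programs in order should reproduce the result table at the top: the map task that reads part-r-00000 (values < 100) starts numbering at 1 and emits indices 1 through 9, the one that reads part-r-00001 starts at 1 + 9 = 10 and emits 10 through 12, and the one that reads part-r-00002 starts at 1 + 9 + 3 = 13 and emits 13 and 14. Merged, the files under output3 contain the pairs from 1 2 up to 14 65223 shown under "Final result" above (figures derived from the sample data, not from an actual run).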