【填坑之旅-hadoop-05】hadoop2.10.1 mr自定义排序分组FlowSumArea/SortMR Partitioner/ WritableComparable 倒排索引

本文链接：https://blog.csdn.net/alwarse/article/details/121065103

job提交流程–补充版

在这里插入图片描述

map task 把处理结果汇报给 MRAppMaster
reduce task 把map处理结果合并，处理运算，再输出

1.实现分区的步骤：

1.1先分析一下具体的业务逻辑，确定大概有多少个分区
1.2首先书写一个类，它要继承org.apache.hadoop.mapreduce.Partitioner这个类
1.3重写public int getPartition这个方法，根据具体逻辑（读数据库或者配置）计算分区号：属于同一分区的key必须返回相同的数字
1.4在main方法中设置Partitioner的类，job.setPartitionerClass(DataPartitioner.class);
1.5设置Reducer的数量，job.setNumReduceTasks(6);

AreaPartitioner extends Partitioner<KEY, VALUE>

int getPartition()

package cn.itcast.hadoop.mr.areapartition;

import java.util.HashMap;

import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitioner that routes a record to a reduce partition according to the
 * first three characters of the key's string form.
 *
 * <p>Per the original comment, keys are expected to be phone numbers and the
 * prefix stands for the phone's home area. The five known prefixes map to
 * partitions 0-4; every other key (including keys shorter than the prefix)
 * goes to the catch-all partition 5. The job therefore has to be configured
 * with at least six reduce tasks for every returned partition to be valid.
 *
 * @param <KEY>   map output key type; only its {@code toString()} is used
 * @param <VALUE> map output value type; ignored
 */
public class AreaPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE>{

	/** Number of leading characters of the key used for the lookup. */
	private static final int PREFIX_LENGTH = 3;

	/** Partition used for keys whose prefix is not in the dictionary. */
	private static final int DEFAULT_PARTITION = 5;

	/** Prefix -> partition number dictionary, filled once at class load. */
	private static final HashMap<String,Integer> areaMap = new HashMap<>();

	static{
		areaMap.put("135", 0);
		areaMap.put("136", 1);
		areaMap.put("137", 2);
		areaMap.put("138", 3);
		areaMap.put("139", 4);
	}

	/**
	 * Looks up the partition for a key by its three-character prefix.
	 *
	 * @param key           map output key; its string form supplies the prefix
	 * @param value         map output value (unused)
	 * @param numPartitions number of reduce tasks configured for the job (unused)
	 * @return the partition number, 0-4 for known prefixes, otherwise 5
	 */
	@Override
	public int getPartition(KEY key, VALUE value, int numPartitions) {
		String phone = key.toString();

		// A key shorter than the prefix cannot match any entry; without this
		// guard substring() would throw and fail the whole task.
		if (phone.length() < PREFIX_LENGTH) {
			return DEFAULT_PARTITION;
		}

		// Single lookup; a missing entry falls back to the catch-all partition.
		Integer areaCode = areaMap.get(phone.substring(0, PREFIX_LENGTH));
		return areaCode == null ? DEFAULT_PARTITION : areaCode;
	}

}

FlowSumArea- FlowSumAreaMapper/FlowSumAreaReducer/main

main
//设置我们自定义的分组逻辑定义
job.setPartitionerClass(AreaPartitioner.class);

package cn.itcast.hadoop.mr.areapartition;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import cn.itcast.hadoop.mr.flowsum.FlowBean;


/**
 * Sums the traffic of every phone number found in the raw flow log and writes
 * the totals of different areas to different output files.
 *
 * <p>Two mechanisms are customised for this:
 * <ol>
 *   <li>the partitioning logic, through a custom partitioner;</li>
 *   <li>the number of concurrent reduce tasks, which must match the number of
 *       partitions the partitioner produces.</li>
 * </ol>
 *
 * @author duanhaitao@itcast.cn
 */
public class FlowSumArea {

	/**
	 * Number of partitions produced by {@code AreaPartitioner}: five known
	 * phone prefixes (partitions 0-4) plus the catch-all partition 5.
	 */
	private static final int AREA_PARTITION_COUNT = 6;

	/**
	 * Parses one tab-separated log line and emits
	 * {@code <phone number, FlowBean(phone, up flow, down flow)>}.
	 */
	public static class FlowSumAreaMapper extends Mapper<LongWritable, Text, Text, FlowBean>{

		@Override
		protected void map(LongWritable key, Text value,Context context)
				throws IOException, InterruptedException {

			// Take one line and split it into its fields.
			String line = value.toString();
			String[] fields = StringUtils.split(line, "\t");

			// Pick the fields we need: phone number, upstream and downstream flow.
			String phoneNB = fields[1];
			long u_flow = Long.parseLong(fields[7]);
			long d_flow = Long.parseLong(fields[8]);

			// Wrap the data as a key/value pair and emit it.
			context.write(new Text(phoneNB), new FlowBean(phoneNB,u_flow,d_flow));

		}

	}

	/**
	 * Adds up the upstream and downstream flow of all beans that share a phone
	 * number and emits a single bean holding the totals.
	 */
	public static class FlowSumAreaReducer extends Reducer<Text, FlowBean, Text, FlowBean>{

		@Override
		protected void reduce(Text key, Iterable<FlowBean> values,Context context)
				throws IOException, InterruptedException {

			long up_flow_counter = 0;
			long d_flow_counter = 0;

			for(FlowBean bean: values){
				up_flow_counter += bean.getUp_flow();
				d_flow_counter += bean.getD_flow();
			}

			context.write(key, new FlowBean(key.toString(), up_flow_counter, d_flow_counter));

		}

	}

	/**
	 * Configures and submits the job.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(FlowSumArea.class);

		job.setMapperClass(FlowSumAreaMapper.class);
		job.setReducerClass(FlowSumAreaReducer.class);

		// Install the custom partitioning logic.
		job.setPartitionerClass(AreaPartitioner.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);

		// The number of reduce tasks must match the number of partitions.
		// With a single reduce task every record ends up in the same output
		// file, so the per-area split would never take place.
		job.setNumReduceTasks(AREA_PARTITION_COUNT);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		System.exit(job.waitForCompletion(true)?0:1);

	}

}

2.排序

MR默认是按key2进行排序的，如果想自定义排序规则，被排序的对象要实现WritableComparable接口，在compareTo方法中实现排序规则，然后将这个对象当做key，即可完成排序…
输入

FlowBean implements WritableComparable

FlowBean {phoneNB,up_flow,d_flow}  

@Override
	public int compareTo(FlowBean o) {
		// Larger total flow sorts first. Long.compare keeps the result
		// antisymmetric, which "a > b ? -1 : 1" is not when both totals are equal.
		int byTotal = Long.compare(o.getS_flow(), s_flow);
		if (byTotal != 0) {
			return byTotal;
		}
		// Tie-break on the phone number so that different phones with the same
		// total remain distinct keys; a missing phone number counts as "".
		String mine = phoneNB == null ? "" : phoneNB;
		String theirs = o.getPhoneNB() == null ? "" : o.getPhoneNB();
		return mine.compareTo(theirs);
	}

package cn.itcast.hadoop.mr.flowsum;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

/**
 * Traffic record of one phone number: upstream flow, downstream flow and their
 * sum. It is a Hadoop {@code WritableComparable}, so it can be used both as a
 * map/reduce value and as a sortable key.
 *
 * <p>Ordering: descending by total flow, ties broken by phone number. Two
 * beans are equal exactly when {@code compareTo} returns 0.
 *
 * <p>NOTE: {@code times} is a plain in-memory field. It is not written by
 * {@link #write(DataOutput)} nor restored by {@link #readFields(DataInput)},
 * so its value does not survive serialization.
 */
public class FlowBean implements WritableComparable<FlowBean>{

	private String phoneNB;
	private long up_flow;
	private long d_flow;
	private long s_flow;
	private long times;

	/**
	 * No-arg constructor. Deserialization creates the object reflectively and
	 * needs it, so it is declared explicitly.
	 */
	public FlowBean(){}

	/**
	 * Convenience constructor; the total flow is derived from the two parts.
	 *
	 * @param phoneNB phone number
	 * @param up_flow upstream flow
	 * @param d_flow  downstream flow
	 */
	public FlowBean(String phoneNB, long up_flow, long d_flow) {
		this.phoneNB = phoneNB;
		this.up_flow = up_flow;
		this.d_flow = d_flow;
		this.s_flow = up_flow + d_flow;
	}

	/**
	 * Re-initialises this bean in place; the total flow is recomputed.
	 *
	 * @param phoneNB phone number
	 * @param up_flow upstream flow
	 * @param d_flow  downstream flow
	 */
	public void set(String phoneNB, long up_flow, long d_flow) {
		this.phoneNB = phoneNB;
		this.up_flow = up_flow;
		this.d_flow = d_flow;
		this.s_flow = up_flow + d_flow;
	}

	public String getPhoneNB() {
		return phoneNB;
	}

	public void setPhoneNB(String phoneNB) {
		this.phoneNB = phoneNB;
	}

	public long getUp_flow() {
		return up_flow;
	}

	public void setUp_flow(long up_flow) {
		this.up_flow = up_flow;
	}

	public long getD_flow() {
		return d_flow;
	}

	public void setD_flow(long d_flow) {
		this.d_flow = d_flow;
	}

	public long getS_flow() {
		return s_flow;
	}

	public void setS_flow(long s_flow) {
		this.s_flow = s_flow;
	}

	public long getTimes() {
		return times;
	}

	public void setTimes(long times) {
		this.times = times;
	}

	/**
	 * Serializes the bean to the stream: phone number, upstream flow,
	 * downstream flow, total flow.
	 */
	@Override
	public void write(DataOutput out) throws IOException {

		out.writeUTF(phoneNB);
		out.writeLong(up_flow);
		out.writeLong(d_flow);
		out.writeLong(s_flow);

	}

	/**
	 * Restores the bean from the stream. The fields must be read in exactly
	 * the order in which {@link #write(DataOutput)} wrote them.
	 */
	@Override
	public void readFields(DataInput in) throws IOException {

		phoneNB = in.readUTF();
		up_flow = in.readLong();
		d_flow = in.readLong();
		s_flow = in.readLong();

	}

	/** Renders the bean as "up flow TAB down flow TAB total flow". */
	@Override
	public String toString() {

		return "" + up_flow + "\t" +d_flow + "\t" + s_flow;
	}

	/**
	 * Orders beans by total flow, largest first, then by phone number.
	 *
	 * <p>Long.compare keeps the result antisymmetric, which
	 * {@code a > b ? -1 : 1} is not when both totals are equal. The phone
	 * number tie-break keeps different phones with the same total as distinct
	 * keys; a missing phone number is treated as the empty string.
	 */
	@Override
	public int compareTo(FlowBean o) {
		int byTotal = Long.compare(o.getS_flow(), s_flow);
		if (byTotal != 0) {
			return byTotal;
		}
		String mine = phoneNB == null ? "" : phoneNB;
		String theirs = o.getPhoneNB() == null ? "" : o.getPhoneNB();
		return mine.compareTo(theirs);
	}

	/** Equality is defined by the ordering: equal iff compareTo returns 0. */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof FlowBean)) {
			return false;
		}
		return compareTo((FlowBean) obj) == 0;
	}

	/** Hash over the same two components the ordering uses. */
	@Override
	public int hashCode() {
		String phone = phoneNB == null ? "" : phoneNB;
		return 31 * phone.hashCode() + (int) (s_flow ^ (s_flow >>> 32));
	}

}

SortMR：main()、SortMapper、SortReducer

class SortMR {
	class SortMapper extends Mapper<LongWritable, Text, FlowBean, NullWritable>{}
	class SortReducer extends Reducer<FlowBean, NullWritable, Text, FlowBean>{}
	main(String[] args) throws Exception {}
}

package cn.itcast.hadoop.mr.flowsort;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import cn.itcast.hadoop.mr.flowsum.FlowBean;

/**
 * Sorts flow records using the natural ordering of {@code FlowBean}.
 *
 * <p>The mapper turns every input line into a {@code FlowBean} and emits it as
 * the key, so the framework's key sort does the ordering. The reducer writes
 * the records back out as {@code <phone number, FlowBean>}.
 */
public class SortMR {

	/**
	 * Splits one tab-separated line (phone number, upstream flow, downstream
	 * flow), wraps it in a FlowBean and emits the bean as the key.
	 */
	public static class SortMapper extends Mapper<LongWritable, Text, FlowBean, NullWritable>{

		@Override
		protected void map(LongWritable key, Text value,Context context)
				throws IOException, InterruptedException {

			String line = value.toString();

			String[] fields = StringUtils.split(line, "\t");

			String phoneNB = fields[0];
			long u_flow = Long.parseLong(fields[1]);
			long d_flow = Long.parseLong(fields[2]);

			context.write(new FlowBean(phoneNB, u_flow, d_flow), NullWritable.get());

		}

	}

	/**
	 * Writes every sorted record as {@code <phone number, FlowBean>}.
	 */
	public static class SortReducer extends Reducer<FlowBean, NullWritable, Text, FlowBean>{

		@Override
		protected void reduce(FlowBean key, Iterable<NullWritable> values,Context context)
				throws IOException, InterruptedException {

			// Emit once per value rather than once per group: if the key
			// comparator ever reports two beans as equal they share one
			// reduce() call, and writing only once would drop records.
			for (NullWritable ignored : values) {
				context.write(new Text(key.getPhoneNB()), key);
			}

		}

	}

	/**
	 * Configures and submits the sort job.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(SortMR.class);

		job.setMapperClass(SortMapper.class);
		job.setReducerClass(SortReducer.class);

		// Map output types differ from the final output types, so both
		// pairs have to be declared.
		job.setMapOutputKeyClass(FlowBean.class);
		job.setMapOutputValueClass(NullWritable.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);

		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		System.exit(job.waitForCompletion(true)?0:1);

	}

}

3.combiner的作用

就是在map端对输出先做一次合并，以减少传输到reducer的数据量。

4.MR启动流程

start-mapred.sh  --> hadoop-daemon.sh --> hadoop --> org.apache.hadoop.mapred.JobTracker


Jobtracker调用顺序：main --> startTracker  --> new JobTracker 在其构造方法中首先创建一个调度器，接着创建一个RPC的server（interTrackerServer）tasktracker会通过RPC机制与其通信

然后调用offerService方法对外提供服务，在offerService方法中启动RPC server，初始化jobtracker，调用taskScheduler的start方法 --> eagerTaskInitializationListener调用start方法，
–> 调用jobInitManagerThread的start方法，因为其是一个线程，会调用JobInitManager的run方法 --> jobInitQueue任务队列去取第一个任务，然后把它丢入线程池中，然后调用–>InitJob的run方法
–> jobTracker的initJob方法 --> JobInProgress的initTasks --> maps = new TaskInProgress[numMapTasks]和reduces = new TaskInProgress[numReduceTasks];

TaskTracker调用顺序：main --> new TaskTracker在其构造方法中调用了initialize方法，在initialize方法中调用RPC.waitForProxy得到一个jobtracker的代理对象

接着TaskTracker调用了本身的run方法，–> offerService方法 --> transmitHeartBeat返回值是（HeartbeatResponse）是jobTracker的指令，在transmitHeartBeat方法中InterTrackerProtocol调用了heartbeat将tasktracker的状态通过RPC机制发送给jobTracker,返回值就是JobTracker的指令
heartbeatResponse.getActions()得到具体的指令，然后判断指令的具体类型，开始执行任务
addToTaskQueue启动类型的指令加入到队列当中，TaskLauncher又把任务加入到任务队列当中，–> TaskLauncher的run方法 --> startNewTask方法 --> localizeJob下载资源 --> launchTaskForJob开始加载任务 --> launchTask --> runner.start()启动线程; --> TaskRunner调用run方法 --> launchJvmAndWait启动java child进程

5倒排索引

输入准备 3个txt a b c

每个文件内容如下
a.txt
hello tom
hello jerry
hello tom

b.txt
hello jerry
hello jerry
tom jerry

c.txt
hello jerry
hello tom

倒排索引过程

第一步统计 hello在各个文件出现的次数，分条

---------------------------------mapper

//context.write(“hello->a.txt”, “1”)
//context.write(“hello->a.txt”, “1”)
//context.write(“hello->a.txt”, “1”)

<“hello->a.txt”, {1,1,1}>
---------------------------------reducer
//context.write(“hello”, “a.txt->3”)
//context.write(“hello”, “b.txt->2”)
//context.write(“hello”, “c.txt->2”)

第二步生成用hello为索引key，汇总在各个文件出现的次数

-----------------------------------------------mapper

//context.write(“hello”, “a.txt->3”)
//context.write(“hello”, “b.txt->2”)
//context.write(“hello”, “c.txt->2”)

<“hello”, {“a.txt->3”, “b.txt->2”, “c.txt->2”}>

-------------------------------- reducer

context.write(“hello”, “a.txt->3 b.txt->2 c.txt->2”)

最终结果

此时在每个文档中出现的次数是没有数据的

hello a.txt->3 b.txt->2 c.txt->2
jerry a.txt->1 b.txt->3 c.txt->1
tom a.txt->2 b.txt->1 c.txt->1

网页搜索引擎搜索hello的时候，展现列表就是 a/b/c.

InverseIndexStepOne StepOneMapper StepOneReducer

package cn.itcast.hadoop.mr.ii;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import cn.itcast.hadoop.mr.flowsort.SortMR;
import cn.itcast.hadoop.mr.flowsort.SortMR.SortMapper;
import cn.itcast.hadoop.mr.flowsort.SortMR.SortReducer;
import cn.itcast.hadoop.mr.flowsum.FlowBean;

/**
 * First job of the inverted-index pipeline: counts how many times each word
 * occurs in each input file.
 *
 * @author duanhaitao@itcast.cn
 */
public class InverseIndexStepOne {

	/**
	 * Emits {@code <word-->fileName, 1>} for every space-separated word of a line.
	 */
	public static class StepOneMapper extends Mapper<LongWritable, Text, Text, LongWritable>{

		@Override
		protected void map(LongWritable key, Text value,Context context)
				throws IOException, InterruptedException {

			// The file name comes from the split this line belongs to.
			FileSplit split = (FileSplit) context.getInputSplit();
			String sourceFile = split.getPath().getName();

			// Break the line into words.
			String[] words = StringUtils.split(value.toString(), " ");

			// One record per word occurrence, e.g. k: hello-->a.txt  v: 1
			for (int i = 0; i < words.length; i++) {
				String compositeKey = words[i] + "-->" + sourceFile;
				context.write(new Text(compositeKey), new LongWritable(1));
			}

		}

	}

	/**
	 * Receives {@code <word-->fileName, {1,1,1,...}>} and emits the sum.
	 */
	public static class StepOneReducer extends Reducer<Text, LongWritable, Text, LongWritable>{

		@Override
		protected void reduce(Text key, Iterable<LongWritable> values,Context context)
				throws IOException, InterruptedException {

			long total = 0;
			for (LongWritable occurrence : values) {
				total = total + occurrence.get();
			}

			LongWritable sum = new LongWritable(total);
			context.write(key, sum);
		}

	}

	/**
	 * Configures and submits the job; an existing output directory is removed first.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(InverseIndexStepOne.class);

		job.setMapperClass(StepOneMapper.class);
		job.setReducerClass(StepOneReducer.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);

		Path input = new Path(args[0]);
		FileInputFormat.setInputPaths(job, input);

		// If the requested output path already exists, delete it recursively.
		Path output = new Path(args[1]);
		FileSystem fs = FileSystem.get(conf);
		boolean outputExists = fs.exists(output);
		if (outputExists) {
			fs.delete(output, true);
		}

		FileOutputFormat.setOutputPath(job, output);

		boolean succeeded = job.waitForCompletion(true);
		System.exit(succeeded ? 0 : 1);

	}

}

StepTwoMapper StepTwoReducer

package cn.itcast.hadoop.mr.ii;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Reducer;

import cn.itcast.hadoop.mr.ii.InverseIndexStepOne.StepOneMapper;
import cn.itcast.hadoop.mr.ii.InverseIndexStepOne.StepOneReducer;

/**
 * Second job of the inverted-index pipeline: regroups the per-file word counts
 * produced by step one so that each word is followed by all of its
 * {@code fileName-->count} entries.
 */
public class InverseIndexStepTwo {

	/** Separator placed between word and file name by step one. */
	private static final String WORD_FILE_SEPARATOR = "-->";

	/**
	 * Turns a step-one line {@code word-->fileName TAB count} into
	 * {@code <word, fileName-->count>}.
	 */
	public static class StepTwoMapper extends Mapper<LongWritable, Text, Text, Text>{

		// k: byte offset of the line    v: {hello-->a.txt   3}
		@Override
		protected void map(LongWritable key, Text value,Context context)
				throws IOException, InterruptedException {

			String line = value.toString();

			// The count is whatever follows the last tab of the line.
			int tabAt = line.lastIndexOf('\t');
			if (tabAt < 0) {
				throw new IOException("Malformed step-one record (no tab): " + line);
			}
			String wordAndFileName = line.substring(0, tabAt);
			long count = Long.parseLong(line.substring(tabAt + 1));

			// Split on the whole "-->" token. StringUtils.split(str, "-->")
			// treats '-' and '>' as individual separator characters and would
			// tear apart words or file names that contain a hyphen. The last
			// occurrence is used so a word may itself contain the token.
			int sepAt = wordAndFileName.lastIndexOf(WORD_FILE_SEPARATOR);
			if (sepAt < 0) {
				throw new IOException("Malformed step-one record (no \"-->\"): " + line);
			}
			String word = wordAndFileName.substring(0, sepAt);
			String fileName = wordAndFileName.substring(sepAt + WORD_FILE_SEPARATOR.length());

			// Map output has the form <hello, a.txt-->3>
			context.write(new Text(word), new Text(fileName + WORD_FILE_SEPARATOR + count));

		}
	}

	/**
	 * Concatenates all {@code fileName-->count} entries of a word, each
	 * followed by a single space.
	 */
	public static class StepTwoReducer extends Reducer<Text, Text,Text, Text>{

		@Override
		protected void reduce(Text key, Iterable<Text> values,Context context)
				throws IOException, InterruptedException {

			// Incoming data: <hello, {a.txt-->3, b.txt-->2, c.txt-->1}>
			StringBuilder result = new StringBuilder();

			for(Text value:values){
				result.append(value).append(" ");
			}

			// Output: k: hello   v: a.txt-->3 b.txt-->2 c.txt-->1
			context.write(key, new Text(result.toString()));

		}

	}

	/**
	 * Configures and submits job two; an existing output directory is removed first.
	 *
	 * @param args {@code args[0]} is the input path (the output of step one),
	 *             {@code args[1]} the output path
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();

		// Job one could be built here and chained in front of job two:
//		Job job_one = Job.getInstance(conf);
//
//		job_one.setJarByClass(InverseIndexStepTwo.class);
//		job_one.setMapperClass(StepOneMapper.class);
//		job_one.setReducerClass(StepOneReducer.class);
		//......

		// Build job two.
		Job jobTwo = Job.getInstance(conf);

		jobTwo.setJarByClass(InverseIndexStepTwo.class);

		jobTwo.setMapperClass(StepTwoMapper.class);
		jobTwo.setReducerClass(StepTwoReducer.class);

		jobTwo.setOutputKeyClass(Text.class);
		jobTwo.setOutputValueClass(Text.class);

		FileInputFormat.setInputPaths(jobTwo, new Path(args[0]));

		// If the requested output path already exists, delete it recursively.
		Path output = new Path(args[1]);
		FileSystem fs = FileSystem.get(conf);
		if(fs.exists(output)){
			fs.delete(output, true);
		}

		FileOutputFormat.setOutputPath(jobTwo, output);

		// When chaining, job one would be submitted first:
//		boolean one_result = job_one.waitForCompletion(true);
//		if(one_result){
		System.exit(jobTwo.waitForCompletion(true)?0:1);
//		}

	}

}