Solution approach
Transform the original <k1,v1> = (byte offset, line text) into <k1,v1> = (line number, line text), so that the input is read one line at a time and keyed by its line number (a small worked example follows this outline).
- Rewrite the TextInputFormat class
  Build LineNumInputFormat.class. Purpose: create the line-number record reader and disable input splitting.
  Override two methods:
  -> createRecordReader()
       return new LineNumRecordReader();
  -> isSplitable()
       return false; // reading by line number, so the file must not be split
- Rewrite the LineRecordReader class
  Build LineNumRecordReader.class. Purpose: turn each key/value pair into (line number, line text).
  a. initialize()      { assign start, end, pos, and in }
  b. nextKeyValue()    { read one line, assign the line number to key and the line text to value }
  c. getCurrentKey()   { return key; // return the current key }
  d. getCurrentValue() { return value; // return the current value }
  e. getProgress()     { return 0; // progress is always reported as 0 }
  f. close()           { in.close(); // close the stream }
- Build the Mapper class
  Test whether the line number is odd or even and emit the line's value under the corresponding key.
- Build the Reducer class
  Compute the average for each group and write it out.
- Build the Job class
  Configure and submit the job.
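As a quick illustration (the file contents below are hypothetical, just to show the key transformation), suppose the input under file:///f:/linenum contains one age per line:

  line 1: 20    -> Mapper receives (1, "20"), emits (3, 20)   // odd line number
  line 2: 30    -> Mapper receives (2, "30"), emits (2, 30)   // even line number
  line 3: 40    -> Mapper receives (3, "40"), emits (3, 40)   // odd line number
  line 4: 50    -> Mapper receives (4, "50"), emits (2, 50)   // even line number

The Reducer then averages each group: even lines (30 + 50) / 2 = 40, odd lines (20 + 40) / 2 = 30.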
Source code for each class
LineNumInputFormat.class
package com.dragon.hadoop.mr.age;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
/*
 * InputFormat that keys each record by its line number
 */
public class LineNumInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new LineNumRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // line numbers only make sense when a single reader sees the whole file,
        // so the input must not be split
        return false;
    }
}
LineNumRecordReader.class
package com.dragon.hadoop.mr.age;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
/*
 * Record reader that produces (line number, line text) pairs
 */
public class LineNumRecordReader extends RecordReader<LongWritable, Text> {

    private long start;
    private long pos;   // originally the byte offset, now used as the line number
    private long end;
    private LineReader in;
    private FSDataInputStream fileIn;
    private LongWritable key;
    private Text value;
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        FileSplit _split = (FileSplit) split;                               // the split for this file
        Path file = _split.getPath();                                       // the file backing the split
        FileSystem fs = file.getFileSystem(context.getConfiguration());     // get the (distributed) file system
        fileIn = fs.open(file);                                             // open the file and get an input stream
        start = _split.getStart();
        end = start + _split.getLength();
        fileIn.seek(start);                 // seek to the start of the split
        in = new LineReader(fileIn);        // wrap the stream in a LineReader
        pos = 1;                            // line numbers start at 1
    }
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new LongWritable();
        }
        key.set(pos);               // the key is the current line number
        if (value == null) {
            value = new Text();
        }
        // read the next line from the stream into value
        if (in.readLine(value) == 0) {
            return false;           // nothing was read: no more key/value pairs
        }
        pos++;                      // advance to the next line number
        return true;
    }
    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        // return the current key (the line number)
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        // return the current value (the line text)
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // progress is always reported as 0
        return 0;
    }

    @Override
    public void close() throws IOException {
        // close the stream
        in.close();
    }
}
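If the job should report real progress instead of a constant 0, one possible replacement for getProgress() (a sketch, assuming fileIn is kept open for the life of the reader, as in the class above) is:

    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (start == end) {
            return 0.0f;
        }
        // how far the underlying stream has advanced through the split, clamped to [0, 1]
        return Math.min(1.0f, (fileIn.getPos() - start) / (float) (end - start));
    }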
LineNumMapper.class
package com.dragon.hadoop.mr.age;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class LineNumMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

    IntWritable i = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value,
            Mapper<LongWritable, Text, IntWritable, IntWritable>.Context context)
            throws IOException, InterruptedException {
        i.set(Integer.parseInt(value.toString()));   // convert the line text to an int
        if (key.get() % 2 == 0) {
            // key 2 stands for even line numbers
            context.write(new IntWritable(2), i);
        } else {
            // key 3 stands for odd line numbers
            context.write(new IntWritable(3), i);
        }
    }
}
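Integer.parseInt above will fail the task on a blank or non-numeric line. A more defensive variant of the map body (a sketch; how bad lines should be handled is an assumption, the original code does not define it) could simply skip them:

        String line = value.toString().trim();
        if (line.isEmpty()) {
            return;                              // ignore empty lines
        }
        try {
            i.set(Integer.parseInt(line));
        } catch (NumberFormatException e) {
            return;                              // ignore lines that are not valid integers
        }
        context.write(new IntWritable(key.get() % 2 == 0 ? 2 : 3), i);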
LineNumReducer.class
package com.dragon.hadoop.mr.age;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class LineNumReducer extends Reducer<IntWritable, IntWritable, Text, IntWritable> {

    private Text _key = new Text();

    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values,
            Reducer<IntWritable, IntWritable, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        int i = 0;
        for (IntWritable value : values) {
            sum += value.get();
            i++;
        }
        if (key.get() == 2) {
            _key.set("Average of even lines:");
        } else {
            _key.set("Average of odd lines:");
        }
        context.write(_key, new IntWritable(sum / i));
        // to emit only the two numbers, use a NullWritable key instead:
        // context.write(NullWritable.get(), new IntWritable(sum / i));
    }
}
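Note that sum / i is integer division, so the averages are truncated. If the fractional part matters, one option (a sketch; it also requires changing the Reducer's output value type and job.setOutputValueClass to DoubleWritable) is:

        // requires: import org.apache.hadoop.io.DoubleWritable;
        double avg = (double) sum / i;
        context.write(_key, new DoubleWritable(avg));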
LineNumJob.class
package com.dragon.hadoop.mr.age;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LineNumJob {

    public static void main(String[] args) throws ClassNotFoundException, InterruptedException, IOException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local");
        Path outfile = new Path("file:///f:/jg");
        FileSystem fs = outfile.getFileSystem(conf);
        if (fs.exists(outfile)) {
            fs.delete(outfile, true);   // remove a previous output directory so the job can run again
        }
        try {
            Job job = Job.getInstance(conf);
            // locate the jar containing this driver class
            job.setJarByClass(LineNumJob.class);
            // set the mapper class
            job.setMapperClass(LineNumMapper.class);
            // set the reducer class
            job.setReducerClass(LineNumReducer.class);
            // set the job name
            job.setJobName("LineNumJob");
            // key/value types emitted by the map phase
            job.setMapOutputKeyClass(IntWritable.class);
            job.setMapOutputValueClass(IntWritable.class);
            // key/value types emitted by the reduce phase
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // use the custom InputFormat
            job.setInputFormatClass(LineNumInputFormat.class);
            // input path for the job
            FileInputFormat.setInputPaths(job, new Path("file:///f:/linenum"));
            // output path for the job
            FileOutputFormat.setOutputPath(job, outfile);
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
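With the local runner configured above, running main reads the input under file:///f:/linenum and writes the result to a part-r-00000 file under file:///f:/jg. For the hypothetical four-line input used in the earlier example, that file would contain roughly:

  Average of even lines:	40
  Average of odd lines:	30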