Solution approach
Transform the original <k1,v1> = (byte offset, line text) into <k1,v1> = (line number, line text), so that the input is read one line at a time and keyed by its line number (a small worked example follows this outline).
- Rewrite the TextInputFormat class
  Build LineNumInputFormat.class. Purpose: create the line-number record reader and disable input splitting.
  Override two methods:
  -> createRecordReader()
       return new LineNumRecordReader();
  -> isSplitable()
       return false; // reading by line number, so the file must not be split
- Rewrite the LineRecordReader class
  Build LineNumRecordReader.class. Purpose: turn each key/value pair into (line number, line text).
  a. initialize()      { assign start, end, pos, and in }
  b. nextKeyValue()    { read one line, assign the line number to key and the line text to value }
  c. getCurrentKey()   { return key; // return the current key }
  d. getCurrentValue() { return value; // return the current value }
  e. getProgress()     { return 0; // progress is always reported as 0 }
  f. close()           { in.close(); // close the stream }
- Build the Mapper class
  Test whether the line number is odd or even and emit the line's value under the corresponding key.
- Build the Reducer class
  Compute the average for each group and write it out.
- Build the Job class
  Configure and submit the job.
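As a quick illustration (the file contents below are hypothetical, just to show the key transformation), suppose the input under file:///f:/linenum contains one age per line:

  line 1: 20    -> Mapper receives (1, "20"), emits (3, 20)   // odd line number
  line 2: 30    -> Mapper receives (2, "30"), emits (2, 30)   // even line number
  line 3: 40    -> Mapper receives (3, "40"), emits (3, 40)   // odd line number
  line 4: 50    -> Mapper receives (4, "50"), emits (2, 50)   // even line number

The Reducer then averages each group: even lines (30 + 50) / 2 = 40, odd lines (20 + 40) / 2 = 30.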
Source code for each class
LineNumInputFormat.class
package com.dragon.hadoop.mr.age;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
/*
 * InputFormat that keys each record by its line number
 */
public class LineNumInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new LineNumRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // line numbers only make sense when a single reader sees the whole file,
        // so the input must not be split
        return false;
    }
}
LineNumRecordReader.class
package com.dragon.hadoop.mr.age;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
/*
 * Record reader that produces (line number, line text) pairs
 */
public class LineNumRecordReader extends RecordReader<LongWritable, Text> {

    private long start;
    private long pos;   // originally the byte offset, now used as the line number
    private long end;
    private LineReader in;
    private FSDataInputStream fileIn;
    private LongWritable key;
    private Text value;
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        FileSplit _split = (FileSplit) split;                               // the split for this file
        Path file = _split.getPath();                                       // the file backing the split
        FileSystem fs = file.getFileSystem(context.getConfiguration());     // get the (distributed) file system
        fileIn = fs.open(file);                                             // open the file and get an input stream
        start = _split.getStart();
        end = start + _split.getLength();
        fileIn.seek(start);                 // seek to the start of the split
        in = new LineReader(fileIn);        // wrap the stream in a LineReader
        pos = 1;                            // line numbers start at 1
    }
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new LongWritable();
        }
        key.set(pos);               // the key is the current line number
        if (value == null) {
            value = new Text();
        }
        // read the next line from the stream into value
        if (in.readLine(value) == 0) {
            return false;           // nothing was read: no more key/value pairs
        }
        pos++;                      // advance to the next line number
        return true;
    }
    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        // return the current key (the line number)
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        // return the current value (the line text)
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // progress is always reported as 0
        return 0;
    }

    @Override
    public void close() throws IOException {
        // close the stream
        in.close();
    }
}
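If the job should report real progress instead of a constant 0, one possible replacement for getProgress() (a sketch, assuming fileIn is kept open for the life of the reader, as in the class above) is:

    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (start == end) {
            return 0.0f;
        }
        // how far the underlying stream has advanced through the split, clamped to [0, 1]
        return Math.min(1.0f, (fileIn.getPos() - start) / (float) (end - start));
    }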
LineNumMapper.class
package com.dragon.hadoop.mr.age;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class LineNumMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

    IntWritable i = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value,
            Mapper<LongWritable, Text, IntWritable, IntWritable>.Context context)
            throws IOException, InterruptedException {
        i.set(Integer.parseInt(value.toString()));   // convert the line text to an int
        if (key.get() % 2 == 0) {
            // key 2 stands for even line numbers
            context.write(new IntWritable(2), i);
        } else {
            // key 3 stands for odd line numbers
            context.write(new IntWritable(3), i);
        }
    }
}
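Integer.parseInt above will fail the task on a blank or non-numeric line. A more defensive variant of the map body (a sketch; how bad lines should be handled is an assumption, the original code does not define it) could simply skip them:

        String line = value.toString().trim();
        if (line.isEmpty()) {
            return;                              // ignore empty lines
        }
        try {
            i.set(Integer.parseInt(line));
        } catch (NumberFormatException e) {
            return;                              // ignore lines that are not valid integers
        }
        context.write(new IntWritable(key.get() % 2 == 0 ? 2 : 3), i);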
LineNumReducer.class
package com.dragon.hadoop.mr.age;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class LineNumReducer extends Reducer<IntWritable, IntWritable, Text, IntWritable> {

    private Text _key = new Text();

    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values,
            Reducer<IntWritable, IntWritable, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        int i = 0;
        for (IntWritable value : values) {
            sum += value.get();
            i++;
        }
        if (key.get() == 2) {
            _key.set("Average of even lines:");
        } else {
            _key.set("Average of odd lines:");
        }
        context.write(_key, new IntWritable(sum / i));
        // to emit only the two numbers, use a NullWritable key instead:
        // context.write(NullWritable.get(), new IntWritable(sum / i));
    }
}
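Note that sum / i is integer division, so the averages are truncated. If the fractional part matters, one option (a sketch; it also requires changing the Reducer's output value type and job.setOutputValueClass to DoubleWritable) is:

        // requires: import org.apache.hadoop.io.DoubleWritable;
        double avg = (double) sum / i;
        context.write(_key, new DoubleWritable(avg));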
LineNumJob.class
package com.dragon.hadoop.mr.age;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LineNumJob {

    public static void main(String[] args) throws ClassNotFoundException, InterruptedException, IOException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local");
        Path outfile = new Path("file:///f:/jg");
        FileSystem fs = outfile.getFileSystem(conf);
        if (fs.exists(outfile)) {
            fs.delete(outfile, true);   // remove a previous output directory so the job can run again
        }
        try {
            Job job = Job.getInstance(conf);
            // locate the jar containing this driver class
            job.setJarByClass(LineNumJob.class);
            // set the mapper class
            job.setMapperClass(LineNumMapper.class);
            // set the reducer class
            job.setReducerClass(LineNumReducer.class);
            // set the job name
            job.setJobName("LineNumJob");
            // key/value types emitted by the map phase
            job.setMapOutputKeyClass(IntWritable.class);
            job.setMapOutputValueClass(IntWritable.class);
            // key/value types emitted by the reduce phase
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // use the custom InputFormat
            job.setInputFormatClass(LineNumInputFormat.class);
            // input path for the job
            FileInputFormat.setInputPaths(job, new Path("file:///f:/linenum"));
            // output path for the job
            FileOutputFormat.setOutputPath(job, outfile);
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
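With the local runner configured above, running main reads the input under file:///f:/linenum and writes the result to a part-r-00000 file under file:///f:/jg. For the hypothetical four-line input used in the earlier example, that file would contain roughly:

  Average of even lines:	40
  Average of odd lines:	30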