one.txt
yongpeng weidong weinan
sanfeng luozong xiaoming
two.txt
longlong fanfan
mazong kailun yuhang yixin
longlong fanfan
mazong kailun yuhang yixin
three.txt
shuaige changmo zhenqiang
dongli lingu xuanxuan
编写如下三个类:
WholeFileDriver.java
package com.atguigu.inputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import java.io.IOException;
public class WholeFileDriver {
    /**
     * Configures and submits a job that reads each input file as a single
     * (path, bytes) record and writes those records to a SequenceFile.
     * No Mapper/Reducer is set, so the framework's identity implementations
     * pass every record through unchanged.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(WholeFileDriver.class);

        // Key = file path (Text), Value = raw file contents (BytesWritable),
        // both for the map output and the final job output.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        // Custom input format reads one whole file per record; SequenceFile
        // output keeps the binary values intact.
        job.setInputFormatClass(WholeFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path("d:\\input"));
        FileOutputFormat.setOutputPath(job, new Path("d:\\output"));

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
WholeFileInputFormat.java
package com.atguigu.inputformat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
/**
 * InputFormat that treats every input file as one unsplittable record.
 * Its RecordReader emits a single KV pair per file:
 * key = the file's path, value = the complete file contents as bytes.
 */
public class WholeFileInputFormat extends FileInputFormat<Text, BytesWritable> {

    /** Never split a file: each one must be read in full as one record. */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /** Returns the reader that turns a whole file into one KV pair. */
    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new WholeFileRecordReader();
    }
}
WholeFileRecordReader.java
package com.atguigu.inputformat;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
/**
 * RecordReader that reads one entire file as a single KV pair:
 * key = the file's path, value = the complete file contents as bytes.
 */
public class WholeFileRecordReader extends RecordReader<Text, BytesWritable> {
    // True until the single record of this split has been emitted.
    private boolean notRead = true;
    private Text key = new Text();
    private BytesWritable value = new BytesWritable();
    private FSDataInputStream inputStream;
    private FileSplit fs;

    /**
     * Called once by the framework before any records are read.
     * Opens a stream on the file backing this (unsplittable) split.
     * @param split the split to read; a FileSplit, since the paired
     *              WholeFileInputFormat extends FileInputFormat
     * @param context task context supplying the job configuration
     * @throws IOException if the file system or file cannot be opened
     * @throws InterruptedException never thrown here; declared by the contract
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        // Narrow to FileSplit to get at the backing file's path.
        fs = (FileSplit) split;
        Path path = fs.getPath();
        // Resolve the file system from the path (local, HDFS, ...).
        FileSystem fileSystem = path.getFileSystem(context.getConfiguration());
        // Open the stream; closed in close().
        inputStream = fileSystem.open(path);
    }

    /**
     * Reads the next KV pair. This reader produces exactly one record per
     * file, so only the first call returns true.
     * @return true if a record was produced, false when the file is consumed
     * @throws IOException if the file is too large for one record or reading fails
     * @throws InterruptedException never thrown here; declared by the contract
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (notRead) {
            long length = fs.getLength();
            // BytesWritable is backed by a byte[]; guard the int cast instead
            // of silently truncating files of 2 GB or more.
            if (length > Integer.MAX_VALUE) {
                throw new IOException("File too large for a single record: " + fs.getPath());
            }
            // Key: the file's full path.
            key.set(fs.getPath().toString());
            // Value: the entire file. readFully loops until the buffer is
            // filled — a bare read() may legally return fewer bytes than asked.
            byte[] buf = new byte[(int) length];
            IOUtils.readFully(inputStream, buf, 0, buf.length);
            value.set(buf, 0, buf.length);
            notRead = false;
            return true;
        } else {
            return false;
        }
    }

    /**
     * Returns the key of the record produced by the last nextKeyValue() call.
     * @return the current key (the file path)
     * @throws IOException never thrown here; declared by the contract
     * @throws InterruptedException never thrown here; declared by the contract
     */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /**
     * Returns the value of the record produced by the last nextKeyValue() call.
     * @return the current value (the file contents)
     * @throws IOException never thrown here; declared by the contract
     * @throws InterruptedException never thrown here; declared by the contract
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * Reports progress through this split: 0 before the single record is
     * read, 1 afterwards.
     * @return the current progress as a fraction in [0, 1]
     * @throws IOException never thrown here; declared by the contract
     * @throws InterruptedException never thrown here; declared by the contract
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return notRead ? 0 : 1;
    }

    /** Releases the input stream opened in initialize(); safe if null. */
    @Override
    public void close() throws IOException {
        IOUtils.closeStream(inputStream);
    }
}
输出结果
SEQorg.apache.hadoop.io.Text"org.apache.hadoop.io.BytesWritable ?凓怵焖?4RJ旲 L file:/d:/input/one.txt 1yongpeng weidong weinan
sanfeng luozong xiaoming N file:/d:/input/three.txt 1shuaige changmo zhenqiang
dongli lingu xuanxuan u file:/d:/input/two.txt Zlonglong fanfan
mazong kailun yuhang yixin
longlong fanfan
mazong kailun yuhang yixin