Source data (in the aa folder):
1. one.txt
yongpeng weidong weinan
sanfeng luozong xiaoming
2. tow.txt
longlong fanfan
mazong kailun yuhang yixin
3. three.txt
shuaige changmo zhenqiang
dongli lingu xuanxuan
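Before the job can run, these three small files must exist under the input path the driver uses (hdfs://bigdata01:9000/input/aa). Uploading with hadoop fs -put aa /input/ works; a minimal programmatic sketch is below (this helper is not in the original post, and the class name UploadInput and the local directory aa are illustrative assumptions):

package com.fjh.myinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.net.URI;

public class UploadInput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Connect to the NameNode that the driver below points at
        FileSystem fs = FileSystem.get(URI.create("hdfs://bigdata01:9000"), conf);
        // Recursively copy the local aa directory to /input/aa
        fs.copyFromLocalFile(new Path("aa"), new Path("/input/aa"));
        fs.close();
    }
}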
1. MyRecordReader.java
package com.fjh.myinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * User: Administrator  Project: hadoop  Date: 2020/10/21 17:01
 */
public class MyRecordReader extends RecordReader<Text, BytesWritable> {

    private Configuration configuration;
    private FileSplit split;
    // Each split is one whole file, so this reader emits exactly one record
    private boolean isProgress = true;
    private BytesWritable value = new BytesWritable();
    private Text k = new Text();

    // Override initialize: remember the split and pull the job configuration
    @Override
    public void initialize(InputSplit split, TaskAttemptContext taskAttemptContext) {
        this.split = (FileSplit) split;
        configuration = taskAttemptContext.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (isProgress) {
            // 1. Allocate a buffer large enough for the whole file
            byte[] bytes = new byte[(int) split.getLength()];
            FSDataInputStream fsDataInputStream = null;
            try {
                // 2. Get the file system for this split's path
                Path path = split.getPath();
                FileSystem fileSystem = path.getFileSystem(configuration);
                // 3. Open the file
                fsDataInputStream = fileSystem.open(path);
                // 4. Read the entire file into the buffer
                IOUtils.readFully(fsDataInputStream, bytes, 0, bytes.length);
                // 5. Set the file content as the value
                value.set(bytes, 0, bytes.length);
                // 6. Use the file path as the key
                k.set(path.toString());
            } catch (Exception e) {
                // Don't swallow errors; surface them to the framework
                throw new IOException("Failed to read split " + split.getPath(), e);
            } finally {
                IOUtils.closeStream(fsDataInputStream);
            }
            isProgress = false;
            return true;
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return k;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // 0 until the single record has been consumed, then 1
        return isProgress ? 0f : 1f;
    }

    @Override
    public void close() throws IOException {
    }
}
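Since each split is a whole file, the reader can be sanity-checked outside a running job by driving it the way the framework would: initialize, then nextKeyValue until it returns false. A minimal local sketch (not part of the original code; the class name, the local path aa/one.txt, and the direct use of TaskAttemptContextImpl are assumptions for illustration):

package com.fjh.myinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class MyRecordReaderLocalTest {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(); // defaults to the local file system
        Path path = new Path("aa/one.txt");       // assumes the sample file exists locally
        long length = path.getFileSystem(conf).getFileStatus(path).getLen();
        // One split spanning the whole file, exactly as MyInputFormat produces
        FileSplit split = new FileSplit(path, 0, length, null);
        MyRecordReader reader = new MyRecordReader();
        reader.initialize(split, new TaskAttemptContextImpl(conf, new TaskAttemptID()));
        while (reader.nextKeyValue()) {
            System.out.println(reader.getCurrentKey() + " -> "
                    + reader.getCurrentValue().getLength() + " bytes");
        }
        reader.close();
    }
}

It should print exactly one line: the file path as the key and the file size as the value length.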
2. MyInputFormat.java
package com.fjh.myinputformat;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

/**
 * User: Administrator  Project: hadoop  Date: 2020/10/21 16:33
 */
public class MyInputFormat extends FileInputFormat<Text, BytesWritable> {

    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        MyRecordReader recordReader = new MyRecordReader();
        // The framework also calls initialize(), so this explicit call is redundant but harmless
        recordReader.initialize(inputSplit, taskAttemptContext);
        return recordReader;
    }

    // Not splittable: each small file must be read as a single whole record
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
}
3. SequenceFileMapper.java
package com.fjh.myinputformat;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * User: Administrator  Project: hadoop  Date: 2020/10/21 19:22
 */
public class SequenceFileMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // Pass each (file path, file bytes) record straight through
        context.write(key, value);
    }
}
4. SequenceFileReducer.java
package com.fjh.myinputformat;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * User: Administrator  Project: hadoop  Date: 2020/10/21 19:30
 */
public class SequenceFileReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException, InterruptedException {
        // Each key is a unique file path, so there is exactly one value per key
        context.write(key, values.iterator().next());
    }
}
5. SequenceFileDriver.java
package com.fjh.myinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;

/**
 * User: Administrator  Project: hadoop  Date: 2020/10/21 19:46
 */
public class SequenceFileDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Get the configuration and create the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Set the jar location and wire up the custom mapper and reducer
        job.setJarByClass(SequenceFileDriver.class);
        job.setMapperClass(SequenceFileMapper.class);
        job.setReducerClass(SequenceFileReducer.class);

        // Use the custom InputFormat
        job.setInputFormatClass(MyInputFormat.class);
        // Write the output as a SequenceFile
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // Key/value types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        // Key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        // Input and output paths; the output directory must not already exist,
        // otherwise the job fails
        FileInputFormat.setInputPaths(job, new Path("hdfs://bigdata01:9000/input/aa"));
        FileOutputFormat.setOutputPath(job, new Path("src/main/resources/output/myformat"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- Result
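The output can be inspected with hadoop fs -text on the part file, or with a small reader like this sketch (not part of the original post; part-r-00000 is the default name of the single reducer's output file and is assumed here):

package com.fjh.myinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Same output directory as the driver; resolved against the default file system
        Path path = new Path("src/main/resources/output/myformat/part-r-00000");
        try (SequenceFile.Reader reader =
                     new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            while (reader.next(key, value)) {
                // Key: original file path; value: that file's full contents
                System.out.println(key + " =>");
                System.out.println(new String(value.getBytes(), 0, value.getLength()));
            }
        }
    }
}

Each record should pair one source file's path with that file's complete contents, confirming that the three small files were packed into a single SequenceFile.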