一、自定义Mapper
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper that converts each (key, bytes) input record into a (key, text) output record.
 *
 * <p>Type parameters {@code Text, BytesWritable, Text, Text} are, in order:
 * input key type, input value type, output key type, output value type.
 * The original comments describe the key as the file name, so the value is
 * presumably that file's raw content.
 */
public class DecomposeFileMapper extends Mapper<Text, BytesWritable, Text, Text> {

    /**
     * Decodes the value bytes as text and emits them under the unchanged key.
     *
     * @param key     record key (described as the file name by the original comments)
     * @param value   raw bytes to decode
     * @param context Hadoop context used to emit the output pair
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // copyBytes() returns an array of exactly the valid length; getBytes() exposes the
        // backing buffer, which may be longer than the data and would add trailing garbage.
        // Decode with an explicit charset so the result does not depend on each node's
        // platform default. NOTE(review): assumes the stored content is UTF-8 -- change the
        // charset here if the files use another encoding.
        String content = new String(value.copyBytes(), StandardCharsets.UTF_8);

        // TODO process the content text here as needed

        // The incoming key is already a Text, so emit it directly instead of
        // converting it to a String and wrapping it in a new Text.
        context.write(key, new Text(content));
    }
}
二、自定义Reducer
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer that emits exactly one output pair per key.
 *
 * <p>Only the first value of each key group is written; any further values
 * for the same key are discarded.
 */
public class DecomposeFileReducer extends Reducer<Text, Text, Text, Text> {

    /**
     * Writes the key together with the first value of its group.
     *
     * @param key     the grouped key
     * @param values  all values that share {@code key}
     * @param context Hadoop context used to emit the output pair
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Take the first value only; the rest of the group is intentionally not read.
        final Text firstValue = values.iterator().next();
        context.write(key, firstValue);
    }
}
三、执行MR
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that configures and submits the MapReduce job built from
 * {@code DecomposeFileMapper} and {@code DecomposeFileReducer}.
 */
public class DecomposeFileDriver {

    /** Input directory used when no command-line argument is supplied. */
    private static final String DEFAULT_INPUT_DIR = "e:/output1";

    /** Output directory used when fewer than two command-line arguments are supplied. */
    private static final String DEFAULT_OUTPUT_DIR = "e:/output2";

    /**
     * Entry point.
     *
     * @param args optional: {@code args[0]} is the input directory and {@code args[1]} the
     *             output directory; any missing argument falls back to the built-in default,
     *             which should be adjusted to match the local machine
     * @throws Exception if configuring or running the job fails
     */
    public static void main(String[] args) throws Exception {
        String inputDir = args.length > 0 ? args[0] : DEFAULT_INPUT_DIR;
        String outputDir = args.length > 1 ? args[1] : DEFAULT_OUTPUT_DIR;
        process(inputDir, outputDir);
    }

    /**
     * Configures the job, runs it to completion, and exits the JVM with
     * status 0 on success or 1 on failure.
     *
     * @param inputDir  input directory path
     * @param outputDir output directory path
     * @throws IOException            if the job cannot be created or submitted
     * @throws InterruptedException   if the wait for job completion is interrupted
     * @throws ClassNotFoundException if a class required by the job cannot be found
     */
    private static void process(String inputDir, String outputDir) throws IOException, InterruptedException, ClassNotFoundException {
        // 1. Obtain the job object
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. Set the jar location and register the custom mapper and reducer
        job.setJarByClass(DecomposeFileDriver.class);
        job.setMapperClass(DecomposeFileMapper.class);
        job.setReducerClass(DecomposeFileReducer.class);

        // 3. Read the input as a SequenceFile
        job.setInputFormatClass(SequenceFileInputFormat.class);

        // 4. Set the key/value types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // 5. Set the key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(inputDir));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));

        // 7. Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
注意点:
要使用value.copyBytes(),它返回的字节数组长度正好等于有效数据的长度。
不要写成value.getBytes():它返回的是内部缓冲区,长度可能大于有效数据(只有前 getLength() 个字节有效),直接转换成字符串会在末尾多出无效内容。