When extracting word frequencies from a file, the counted frequency data can be written to separate output files, which makes the data more efficient to write out.
Approach:
The Map class splits and cleans up the file contents.
The map results are emitted through the OutputCollector context into the shuffle phase (for details, see "How MapReduce Works").
After shuffle groups and sorts the pairs, the results are fed to Reduce, which merges duplicate records and counts how many times each one appears.
The reduce output is then handed through the context to the MultipleOutputFormat class, which receives key/value pairs in which the key is the record text and the value is its count, and decides which file each pair is written to; a concrete trace follows.
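For a record such as 2014-3-2a that occurs three times across the inputs, the pipeline behaves as follows (a plain-text trace of the stages, not code):

map:     (offset, "2014-3-2a") -> ("2014-3-2a", 1), emitted once per occurrence
shuffle: groups the pairs into ("2014-3-2a", [1, 1, 1])
reduce:  sums the list to ("2014-3-2a", 3)
output:  MultipleOutputFormat writes the pair to the file named by generateFileNameForKeyValue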
For example, take the data in the following two files:
File 1:
2014-3-2a
2015-4-3b
2015-3-2c
2015-5-2d
2014-3-2a
2015-4-3b
2014-5-4c
2014-6-7d
File 2:
2015-3-1a
2015-3-2b
2015-4-3c
2015-5-4d
2014-3-2a
2015-4-3b
2015-5-4c
2015-5-4d
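Because none of these lines contains a tab, the mapper's split("\t") leaves each line intact, so every full line is counted as one record. Summing occurrences across both files, the key/value pairs that reach MultipleOutputFormat are:

2014-3-2a	3
2014-5-4c	1
2014-6-7d	1
2015-3-1a	1
2015-3-2b	1
2015-3-2c	1
2015-4-3b	3
2015-4-3c	1
2015-5-2d	1
2015-5-4c	1
2015-5-4d	2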
package day0914;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputFormat;
import org.apache.hadoop.util.Progressable;
import day0914.StationNameMultip1.MyMap.MyMultipleFilesTextOutputFormat;
import day0914.StationNameMultip1.MyMap.MyReducer;

public class StationNameMultip1 { // driver class
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        final JobConf job = new JobConf(conf, StationNameMultip1.class);
        job.setJarByClass(StationNameMultip1.class);
        // map and reduce classes
        job.setMapperClass(MyMap.class);
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setOutputFormat(MyMultipleFilesTextOutputFormat.class); // spread results across multiple output files
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        JobClient.runJob(job);
    }

    // Mapper class
    public static class MyMap extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            String line = value.toString();
            final String[] splited = line.split("\t");
            for (String word : splited) {
                output.collect(new Text(word), new LongWritable(1));
            }
        }

        // Reducer class (nested inside MyMap, matching the imports above)
        public static class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable> {
            @Override
            public void reduce(Text key, Iterator<LongWritable> values,
                    OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
                long count = 0L;
                while (values.hasNext()) {
                    LongWritable times = values.next();
                    count += times.get(); // total occurrences of this key across the input files
                }
                output.collect(key, new LongWritable(count));
            }
        }
        // multiple-file output class
        public static class MyMultipleFilesTextOutputFormat extends MultipleOutputFormat<Text, LongWritable> {
            @Override
            protected RecordWriter<Text, LongWritable> getBaseRecordWriter(FileSystem fs, JobConf job, String name,
                    Progressable progress) throws IOException {
                // parameters: fs - the file system; job - the job configuration;
                // name - the name of the output file; progress - a hook for reporting write progress
                final TextOutputFormat<Text, LongWritable> textOutputFormat = new TextOutputFormat<Text, LongWritable>();
                return textOutputFormat.getRecordWriter(fs, job, name, progress);
            }

            @Override
            protected String generateFileNameForKeyValue(Text key, LongWritable value, String name) { // picks the output file for each record
                final String keyString = key.toString();
                // assumed condition (the test itself was lost): route 2014 records to the "hello" file
                if (keyString.startsWith("2014")) {
                    return "hello"; // matching records all go to a file named "hello"
                } else {
                    return keyString; // every other record goes to a file named after its key
                }
            }
        }
    }
}
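To run the job, package the classes into a jar and pass the input and output directories as the two command-line arguments. A typical invocation looks like the following (the jar name and HDFS paths are placeholders, not from the original post):

hadoop jar stationname.jar day0914.StationNameMultip1 /user/hadoop/input /user/hadoop/output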
Result:
Contents of the multiple output files:
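Under the assumed startsWith("2014") test in generateFileNameForKeyValue, and with a single reducer, the output directory would contain a file named hello holding all 2014 records plus one file per remaining key, roughly:

hello:      2014-3-2a 3, 2014-5-4c 1, 2014-6-7d 1
2015-4-3b:  2015-4-3b 3
2015-5-4d:  2015-5-4d 2
(and one single-line file for each of the other 2015 keys)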