Hadoop 笔记MuilpleOutputFormat类(将数据写到多个文件中)代码解析

最新推荐文章于 2021-01-12 06:26:10 发布

IT菜籽U

最新推荐文章于 2021-01-12 06:26:10 发布

阅读量657

点赞数

分类专栏： MR

本文链接：https://blog.csdn.net/xiaoshunzi111/article/details/48444995

版权

MR 专栏收录该内容

79 篇文章 2 订阅

订阅专栏

在读出文件中词频时,可以讲统计的词频数据分别放在不同的文件中,从而提高数据的写入效率

思路:

通过Map类中对文件内容进行切分,整理

通过Context上下文类将数据结果输入到shuffle详情参见MapReduce工作原理

shuffle进行分类排序后将结果输入到Reduce中 Reduce去重且处理记录重复数据的个数

通过上下文将结果交给MultipleOutputFormat类处理,此时MultipleOutputFormat收到是键值对即是键为关键字的偏移量值为处理的数据

如为两个文件中数据

文件1

2014-3-2a
2015-4-3b
2015-3-2c
2015-5-2d
2014-3-2a
2015-4-3b
2014-5-4c
2014-6-7d

文件2

2015-3-1a
2015-3-2b
2015-4-3c
2015-5-4d
2014-3-2a
2015-4-3b
2015-5-4c
2015-5-4d

package day0914;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputFormat;
import org.apache.hadoop.util.Progressable;

import day0914.StationNametlultip1.MyMap.MyMultipleFilesTextOutputFormat;
import day0914.StationNametlultip1.MyMap.MyReducer;

public class StationNametlultip1 {//主方法类

   public static void main(String[] args) throws Exception {
       //v2
       Configuration conf = new Configuration();
       final JobConf job = new JobConf(conf , StationNametlultip1.class);
       job.setJarByClass(StationNametlultip1.class);
       //map

       job.setMapperClass(MyMap.class);
       job.setReducerClass(MyReducer.class);

       job.setMapOutputKeyClass(Text.class);
       job.setMapOutputValueClass(LongWritable.class);


       job.setOutputKeyClass(Text.class);
       job.setOutputValueClass(LongWritable.class);

       job.setOutputFormat(MyMultipleFilesTextOutputFormat.class);//定义文件输出格式为多文件输出

       FileInputFormat.setInputPaths(job, new Path(args[0]));
       FileOutputFormat.setOutputPath(job, new Path(args[1]));


       JobClient.runJob(job);
   }
//Mapper类
   public static class MyMap extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable>{
       @Override
       public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> output, Reporter reporter)
               throws IOException {
           String line = value.toString();
           final String[] splited = line.split("\t");
           for(String word:splited){
           output.collect(new Text(word),new LongWritable(1));

            }
       }
    //Reduce类
   public static class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text,                  LongWritable>{
       @Override
       public void reduce(Text key, Iterator<LongWritable> values,
               OutputCollector<Text, LongWritable> output, Reporter reporter)
               throws IOException {
               long count = 0L;
               while(values.hasNext()) {
                   LongWritable times = values.next();
                   count += times.get();//某关键字所对应的值在文件中出现的次数
               }
               output.collect(key, new LongWritable(count));
       }
   }
    //多文件输出类
   public static class MyMultipleFilesTextOutputFormat extends MultipleOutputFormat<Text, LongWritable>{
       @Override
       protected RecordWriter<Text, LongWritable> getBaseRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress)throws IOException {

           final TextOutputFormat<Text, LongWritable> textOutputFormat = new TextOutputFormat<Text, LongWritable>();

//四个参数分别为:

//fs 文件系统

// job文件作业

//name数据输出文件的名字

// progress数据生成的进度

           return textOutputFormat.getRecordWriter(fs, job, name, progress);
   }
       @Override
       protected String generateFileNameForKeyValue(Text key,//创建文件
               LongWritable value, String name) {

final String keyString = key.toString();

           if(keyString.startsWith("hello")) {//当读取的文件开始是hello字符串时
               return "hello";//则返回名字为hello文件
           }else {
               return keyString;//否则返回所读文件中key(文件中数据的偏移量)的名字的文件
           }
       }
   }
   }
}

得到结果: