MapReduce Review

① MapReduce word count

WordCountMapper

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
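
The driver below also references a WordCountReducer and a WordCountCombiner that are not listed with this first variant. A minimal sketch of both is given here (assumption: the combiner can safely reuse the reducer's summation logic, since word counts are associative):

WordCountReducer.java (sketch)

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the partial counts emitted by the mappers (and the combiner, if it ran)
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

WordCountCombiner.java (sketch)

// Hypothetical combiner: it simply reuses the reducer's summation logic,
// which is valid for word counting because partial sums can be added again later.
public class WordCountCombiner extends WordCountReducer {
}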

WordCountDriver.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

    public static void main(String[] args) throws Exception {
        // Wrap all the information about this MR job in a Job object
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");
        Job wcjob = Job.getInstance(conf);

        // Main class of the jar that runs this MR job
        wcjob.setJarByClass(WordCountDriver.class);

        // Mapper and Reducer classes used by this job
        wcjob.setMapperClass(WordCountMapper.class);
        wcjob.setReducerClass(WordCountReducer.class);

        // Key and value types emitted by the Mapper
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(IntWritable.class);

        // Key and value types emitted by the Reducer
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(IntWritable.class);

        // Combiner component
        wcjob.setCombinerClass(WordCountCombiner.class);

        // Location of the input data
        FileInputFormat.setInputPaths(wcjob, new Path("input"));

        // Location where the results are saved
        FileOutputFormat.setOutputPath(wcjob, new Path("output"));

        // Submit the job and print its progress
        boolean res = wcjob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}

 

Alternatively:

WordCountMapper

package cipintongji;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        StringTokenizer token = new StringTokenizer(line);
        while (token.hasMoreTokens()) {
            word.set(token.nextToken());
            context.write(word, one);
        }
    }
}

WordCountReducer

package cipintongji;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

WordCountDriver

package cipintongji;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

    public static void main(String[] args) throws Exception {
        final String hdfsurl = "hdfs://localhost:9000";

        // Assemble a job and submit it
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(WordCountMapper.class);
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);

        // When the map output types are the same as the reduce output types,
        // the map output type settings can be omitted
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Path of the input data files (this could also be passed as the first
        // command-line argument; see the sketch after this listing)
        FileInputFormat.addInputPath(job, new Path(hdfsurl + "/input/"));

        // Path where the final reducer output is saved (could also be passed
        // as the second command-line argument)
        FileOutputFormat.setOutputPath(job, new Path(hdfsurl + "/out"));

        // The boolean argument controls whether detailed progress is printed to the console
        boolean flag = job.waitForCompletion(true);
        System.exit(flag ? 0 : 1);
    }
}
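
As the comments above note, the input and output paths can instead be taken from the command line rather than hardcoded. A minimal sketch of such a driver, assuming args[0] is the input path and args[1] the output path (the class name WordCountArgsDriver is hypothetical, not part of the original example):

package cipintongji;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical variant of the driver above, invoked as:
//   hadoop jar wc.jar cipintongji.WordCountArgsDriver <input path> <output path>
public class WordCountArgsDriver {

    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: WordCountArgsDriver <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountArgsDriver.class);
        job.setMapperClass(WordCountMapper.class);
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // first argument: input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // second argument: output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}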

② MapReduce data deduplication

DedupMapper.java

package mr.dedup;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class DedupMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    private static Text field = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Emit the whole line as the key; identical lines are grouped together by the shuffle
        field.set(value);
        context.write(field, NullWritable.get());
    }
}

DedupReducer.java

package mr.dedup;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class DedupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        // Each distinct line arrives here exactly once as a key, so writing the key deduplicates
        context.write(key, NullWritable.get());
    }
}

DedupRunner.java

package mr.dedup;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DedupRunner {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");

        Job job = Job.getInstance(conf);
        job.setJarByClass(DedupRunner.class);
        job.setMapperClass(DedupMapper.class);
        job.setReducerClass(DedupReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("/dedup/input"));
        FileOutputFormat.setOutputPath(job, new Path("/dedup/output"));

        job.waitForCompletion(true);
    }
}

③ MapReduce inverted index

InvertedIndexMapper.java

package mr.InvertedIndex;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> {

    private static Text keyInfo = new Text();             // holds the word:URL combination
    private static final Text valueInfo = new Text("1");  // holds the term frequency, initialized to 1

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split(" ");                          // split the line into words
        FileSplit fileSplit = (FileSplit) context.getInputSplit();  // file split this line belongs to
        String fileName = fileSplit.getPath().getName();            // file name taken from the split
        for (String field : fields) {
            // the key combines the word and the URL, e.g. "MapReduce:file1"
            keyInfo.set(field + ":" + fileName);
            context.write(keyInfo, valueInfo);
        }
    }
}

InvertedIndexCombiner.java

package mr.InvertedIndex;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {

    private static Text info = new Text();

    // input:  <MapReduce:file3, {1,1,...}>
    // output: <MapReduce, file3:2>
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // sum the term frequency
        int sum = 0;
        for (Text value : values) {
            sum += Integer.parseInt(value.toString());
        }
        int splitIndex = key.toString().indexOf(":");
        // new value: the URL plus the term frequency
        info.set(key.toString().substring(splitIndex + 1) + ":" + sum);
        // new key: the word alone
        key.set(key.toString().substring(0, splitIndex));
        context.write(key, info);
    }
}

InvertedIndexReducer.java

package mr.InvertedIndex;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> {

    private static Text result = new Text();

    // input:  <MapReduce, file3:2>
    // output: <MapReduce, file1:1;file2:1;file3:2;>
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // build the posting list for this word
        StringBuilder fileList = new StringBuilder();
        for (Text value : values) {
            fileList.append(value.toString()).append(";");
        }
        result.set(fileList.toString());
        context.write(key, result);
    }
}

InvertedIndexRunner.java

package mr.InvertedIndex;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvertedIndexRunner {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");

        Job job = Job.getInstance(conf);
        job.setJarByClass(InvertedIndexRunner.class);
        job.setMapperClass(InvertedIndexMapper.class);
        job.setCombinerClass(InvertedIndexCombiner.class);
        job.setReducerClass(InvertedIndexReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // location of the input data
        FileInputFormat.setInputPaths(job, new Path("/InvertedIndex/input"));
        // location where the results are saved
        FileOutputFormat.setOutputPath(job, new Path("/InvertedIndex/output"));

        // submit this job to the Hadoop cluster
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
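
To make the data flow through the three stages easier to follow, here is a small plain-Java simulation (hypothetical file names and contents, no Hadoop involved) that mirrors the Mapper → Combiner → Reducer logic above and prints lines in the same word / file:count;... format:

import java.util.LinkedHashMap;
import java.util.Map;

// Hypothetical stand-alone illustration of the inverted-index pipeline above.
public class InvertedIndexSimulation {

    public static void main(String[] args) {
        // Two made-up input files (contents are illustrative only)
        Map<String, String> files = new LinkedHashMap<>();
        files.put("file1.txt", "MapReduce is simple");
        files.put("file2.txt", "MapReduce is powerful is simple");

        // word -> (file -> count); mirrors what the Mapper plus Combiner produce per file
        Map<String, Map<String, Integer>> index = new LinkedHashMap<>();
        for (Map.Entry<String, String> file : files.entrySet()) {
            for (String word : file.getValue().split(" ")) {
                index.computeIfAbsent(word, w -> new LinkedHashMap<>())
                     .merge(file.getKey(), 1, Integer::sum);
            }
        }

        // The Reducer then concatenates the per-file counts into one posting list per word
        for (Map.Entry<String, Map<String, Integer>> entry : index.entrySet()) {
            StringBuilder postings = new StringBuilder();
            for (Map.Entry<String, Integer> posting : entry.getValue().entrySet()) {
                postings.append(posting.getKey()).append(":").append(posting.getValue()).append(";");
            }
            System.out.println(entry.getKey() + "\t" + postings);
        }
    }
}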
