Hadoop Learning Notes: MapReduce

Java development for MapReduce follows a fairly fixed pattern; a job generally consists of three parts: the Mapper, the Reducer, and the main entry-point (driver) configuration. Using the classic word-count example, the steps are as follows.

1 Mapper code:

package com.hadoop.mr.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created by 86186 on 2019/8/31.
 * The four generic parameters of Mapper are:
 * LongWritable: input key type - the byte offset of the start of each line within the file
 * Text: input value type - the content of one line
 * Text: output key type - the word
 * IntWritable: output value type - the count (always 1 here)
 */
public class WordCountMapper  extends Mapper<LongWritable,Text,Text,IntWritable>{

    /**
     *
     * @param key the input key - the byte offset of the current line
     * @param value the input value - one line of the input split, i.e. the content of a single line
     * @param context the context used to emit output
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line into words on any run of non-letter characters (\P{L})
        String[] words = value.toString().split("\\PL+");
        for (String word : words) {
            if (word.isEmpty()) {
                continue; // split can produce an empty token when a line starts with a non-letter
            }
            word = word.toLowerCase();
            context.write(new Text(word), new IntWritable(1)); // emit (word, 1)
        }
    }
}
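
The only non-obvious piece above is the split regex: "\\PL+" matches one or more non-letter characters, so punctuation, digits, and whitespace all act as word separators. A quick standalone check of what one line gets split into (plain Java, hypothetical class name, not part of the job):

public class SplitCheck {
    public static void main(String[] args) {
        String line = "Hello, hadoop world! hello";
        // "\\PL+" splits on any run of non-letter characters
        for (String word : line.split("\\PL+")) {
            System.out.println("[" + word.toLowerCase() + "]");
        }
        // Prints: [hello] [hadoop] [world] [hello]
    }
}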

2 Reducer code:

package com.hadoop.mr.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;

/**
 * Created by 86186 on 2019/8/31.
 * Text: Reducer input key type - matches the Mapper output key
 * IntWritable: Reducer input value type - matches the Mapper output value
 * Text: Reducer output key type
 * IntWritable: Reducer output value type
 */
public class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable>{

    /**
     *
     * @param key the input key, matching the Mapper output key
     * @param values the input values for this key, already grouped by the shuffle
     * @param context the context used to emit output
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        /**
         * The values here have already been grouped by key, so all that is left is to sum them.
         */
        int count = 0;
        Iterator<IntWritable> iterator = values.iterator();
        while (iterator.hasNext()) {
            IntWritable value = iterator.next();
            count += value.get();
        }
        System.out.println("key:" + key.toString()); // debug output only
        context.write(key,new IntWritable(count));
    }
}
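
To make the shuffle step concrete, here is a minimal in-memory sketch (plain Java, hypothetical, not part of the job) of what the Reducer receives: the framework groups all map output by key, so each call to reduce gets one word together with every 1 that was emitted for it.

import java.util.*;

public class ShuffleSketch {
    public static void main(String[] args) {
        String line = "Hello hadoop, hello world";
        Map<String, List<Integer>> grouped = new TreeMap<>();
        // "Map" side: emit (word, 1), then group by key as the shuffle would
        for (String word : line.split("\\PL+")) {
            if (!word.isEmpty()) {
                grouped.computeIfAbsent(word.toLowerCase(), k -> new ArrayList<>()).add(1);
            }
        }
        // "Reduce" side: each key arrives with all of its values, e.g. hello -> [1, 1]
        for (Map.Entry<String, List<Integer>> e : grouped.entrySet()) {
            int count = 0;
            for (int one : e.getValue()) {
                count += one;
            }
            System.out.println(e.getKey() + "\t" + count);
        }
        // Prints: hadoop 1, hello 2, world 1 (keys in sorted order, as in the job output)
    }
}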

3 Driver (entry-point) configuration

package com.hadoop.mr.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

/**
 * Created by 86186 on 2019/8/31.
 */
public class WordCountApp {

    public static void main(String[] args) throws Exception{
        System.setProperty("HADOOP_USER_NAME", "hadoop"); // HDFS user to run as
        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS", "hdfs://192.168.0.120:8020"); // NameNode URI
        // Create a Job
        Job job = Job.getInstance(configuration);
        job.setJarByClass(WordCountApp.class);       // jar containing this driver
        job.setMapperClass(WordCountMapper.class);   // Mapper class
        job.setReducerClass(WordCountReducer.class); // Reducer class

        job.setMapOutputKeyClass(Text.class);          // Mapper output key type
        job.setMapOutputValueClass(IntWritable.class); // Mapper output value type

        job.setOutputKeyClass(Text.class);             // Reducer output key type
        job.setOutputValueClass(IntWritable.class);    // Reducer output value type

       /*
        If the output directory already exists, delete it first; otherwise the job will fail.
         */
        FileSystem fileSystem = FileSystem.get(new URI("hdfs://192.168.0.120:8020"), configuration, "hadoop");
        Path outputPath = new Path("/wordcount/output");
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }

        FileInputFormat.setInputPaths(job, new Path("/wordcount/input/h.txt")); // input file
        FileOutputFormat.setOutputPath(job, outputPath); // output directory
        
        boolean result = job.waitForCompletion(true); // submit the job and wait for completion
        System.exit(result ? 0 : -1);

    }
}
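
One optional tweak to the driver above: because the reduce logic is a plain sum (associative and commutative), the same WordCountReducer can also be registered as a combiner to pre-aggregate counts on the map side and cut shuffle traffic. A sketch of the extra line, an optional optimization that is not part of the original driver, placed right after job.setReducerClass(...):

        // Optional: reuse the reducer as a combiner to pre-sum counts before the shuffle
        job.setCombinerClass(WordCountReducer.class);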

3.1 View the contents of h.txt:

3.2 Run the program and check the results:

4 If you only want to test the Mapper/Reducer logic, you can also run the job in local mode, with no HDFS setup required.

The code is as follows (only the driver code needs to change):

package com.hadoop.mr.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Created by 86186 on 2019/8/31.
 */
public class WordCountAppLocal {

    public static void main(String[] args) throws Exception{
        Configuration configuration = new Configuration();
        // Create a Job
        Job job = Job.getInstance(configuration);
        job.setJarByClass(WordCountAppLocal.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job,new Path("D:\\F\\hadoop\\mr\\input\\a.txt"));
        FileOutputFormat.setOutputPath(job,new Path("D:\\F\\hadoop\\mr\\out5"));

        boolean result=job.waitForCompletion(true);
        System.exit(result ? 0:-1);

    }
}
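
In local mode the output lands directly on the local file system; with the default single reducer, the result should end up in a part-r-00000 file under the output directory. (On Windows, local mode typically also requires winutils.exe / HADOOP_HOME to be configured, depending on the Hadoop version.) A quick way to print the result, as a hypothetical helper assuming the output path used above:

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class PrintLocalOutput {
    public static void main(String[] args) throws Exception {
        // part-r-00000 is the default name of the single reducer's output file
        Path out = Paths.get("D:\\F\\hadoop\\mr\\out5", "part-r-00000");
        Files.readAllLines(out).forEach(System.out::println);
    }
}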

4.1 File contents:

4.2 Execution result:

5 Summary: the basic workflow

The Mapper reads the input line by line, splits each line into words, and emits (word, 1) pairs; the shuffle groups all values by key; the Reducer sums the values for each word and writes (word, count); the driver wires everything together by setting the Mapper/Reducer classes, the output key/value types, and the input/output paths, then submits the job with waitForCompletion.
