Hadoop MapReduce

MapReduce: a programming model.

Writing an MR job:

Mapper class:

package com.mao.hdfs.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Word-count mapper: emits (word, 1) for every token in the input line.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        // split the line on spaces and emit each word with a count of 1
        String[] arr = value.toString().split(" ");
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}

Reducer class:

package com.mao.hdfs.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Word-count reducer: sums the partial counts for each word.
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}

Driver class (App):

package com.mao.hdfs.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class WCApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");                       // run against the local file system
//        if (args.length > 1) {
//            // optionally delete the output directory if it already exists
//            FileSystem.get(conf).delete(new Path(args[1]), true);
//        }
        Job job = Job.getInstance(conf);
        // set job properties
        job.setJobName("WCApp");                                    // job name
        job.setJarByClass(WCApp.class);                             // class used to locate the job jar
        job.setInputFormatClass(TextInputFormat.class);             // input format
        job.setOutputFormatClass(SequenceFileOutputFormat.class);   // output format

        FileInputFormat.addInputPath(job, new Path(args[0]));       // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));     // output path

        // maximum input split size
        //FileInputFormat.setMaxInputSplitSize(job, 13);
        // minimum input split size
        //FileInputFormat.setMinInputSplitSize(job, 1L);

        job.setPartitionerClass(MyPartitioner.class);               // custom partitioner
        job.setCombinerClass(WCReducer.class);                      // combiner class

        job.setMapperClass(WCMapper.class);                         // mapper class
        job.setReducerClass(WCReducer.class);                       // reducer class

        job.setNumReduceTasks(1);                                   // number of reduce tasks
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.waitForCompletion(true);
    }
}

Custom partitioner class:

package com.mao.hdfs.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class MyPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        // send every key to partition 0 (only sensible with a single reducer)
        return 0;
    }
}
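
The partitioner above routes every key to partition 0, which only makes sense with a single reducer (as WCApp configures). For comparison, a minimal sketch of a hash-based partitioner, essentially what Hadoop's default HashPartitioner does (the class name here is just illustrative):

package com.mao.hdfs.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class HashedTextPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        // clear the sign bit, then spread keys evenly across all reducers
        return (text.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}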


Combiner (map-side pre-aggregation)
-----------------
    A combiner is a map-side Reducer that pre-aggregates map output before the shuffle. It extends Reducer, and a job's own Reducer class can often be reused as the combiner (WCApp reuses WCReducer), provided its input and output types both match the map output types.
    1. The goal is to reduce network bandwidth by aggregating the data emitted on the map side before it is sent to the reducers. Not every job can use a combiner; a sketch of the contract follows.
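
A sketch of that contract, written against the WCApp driver (the comments are illustrative, not from the original notes):

    // A combiner must map (K2, Iterable<V2>) -> (K2, V2): its input and output types
    // both equal the map output types, because the framework may apply it zero, one,
    // or many times per key. Summing partial word counts satisfies this; computing an
    // average over raw values, for example, would not.
    job.setCombinerClass(WCReducer.class);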

Running an MR job in local mode (LocalJobRunner simulates MR with multiple threads; a minimal configuration sketch follows the steps)
-------------------------
    1. Create the external Job (mapreduce.Job) and set its configuration.
    2. The JobSubmitter writes job.xml, the split metadata, and related files to a temporary staging directory.
    3. The JobSubmitter submits the job to the LocalJobRunner.
    4. The LocalJobRunner converts the external Job into an internal Job.
    5. The internal Job thread spawns a worker thread to execute the job.
    6. The job execution thread computes the map and reduce task information and uses a thread pool to spawn new threads that run the MR tasks.
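
A minimal driver-configuration sketch for forcing local mode (both property keys are standard Hadoop configuration names; the rest mirrors WCApp):

    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "file:///");             // read/write the local file system
    conf.set("mapreduce.framework.name", "local");    // run through LocalJobRunner instead of YARN
    Job job = Job.getInstance(conf);
    // ...set mapper, reducer, and input/output paths exactly as in WCApp, then:
    job.waitForCompletion(true);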


Running the MR job on a Hadoop cluster
-------------------------
    1. Build the jar file (generated on Windows):
        install Maven and package the project (see the command sketch after the notes).
    2. Copy the jar to the Hadoop cluster.
    3. Run the hadoop jar command:
        $>hadoop jar HdfsDemo-1.0-SNAPSHOT.jar com.mao.hdfs.mr.WCApp hdfs://s201/user/centos/wc/data hdfs://s201/user/centos/wc/out
Notes:

HdfsDemo-1.0-SNAPSHOT.jar             // the jar file to run

com.mao.hdfs.mr.WCApp                 // fully qualified name of the driver (main) class

hdfs://s201/user/centos/wc/data       // HDFS input directory

hdfs://s201/user/centos/wc/out        // HDFS output directory
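
A sketch of the build-and-copy steps, assuming a standard Maven project and that the jar is copied to the s201 node as user centos (host and user taken from the HDFS paths above; adjust for your cluster):

    $>mvn clean package
    $>scp target/HdfsDemo-1.0-SNAPSHOT.jar centos@s201:~/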

 
