Background: a hadoop-2.8.5 cluster is already running on CentOS 7.5. The goal is to write and run MapReduce (MR) jobs from Windows.
Setting up the Windows hadoop environment
- Download the same version of hadoop as the linux cluster (hadoop-2.8.5).
- Configure the hadoop environment variables:
  - HADOOP_HOME: D:\hadoop-2.8.5
  - HADOOP_BIN_PATH: %HADOOP_HOME%\bin
  - HADOOP_PREFIX: %HADOOP_HOME%
  - Append ;%HADOOP_HOME%\bin;%HADOOP_HOME%\sbin; to the Path variable.
- Unzip winutils-master.zip and copy its files (the ones matching your hadoop version, if the archive ships several) into the bin directory under HADOOP_HOME.
- Verify the local hadoop installation, e.g. by running hadoop version in a command prompt (see the check sketched after this list).
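Before writing any MR code, it can also help to confirm that the variables above are visible to a JVM and that winutils.exe is in place. A minimal sketch (the class name and checks are illustrative, not part of the original setup):

import java.io.File;

/** Hypothetical helper: verifies HADOOP_HOME and winutils.exe before running local MR jobs. */
public class EnvCheck {
    public static void main(String[] args) {
        String home = System.getenv("HADOOP_HOME");
        if (home == null) {
            System.err.println("HADOOP_HOME is not set");
            return;
        }
        File winutils = new File(home, "bin\\winutils.exe");
        System.out.println("HADOOP_HOME = " + home);
        System.out.println("winutils.exe present: " + winutils.exists());
    }
}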
Writing the Mapper and Reducer
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * WCMapper: emits (word, 1) for every whitespace-separated token of the input line.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        String[] arr = value.toString().split(" ");
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * WCReducer: sums the counts for each word.
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        String tno = Thread.currentThread().getName();
        System.out.println(tno + " : WCReducer : " + key.toString() + "=" + count);
        context.write(key, new IntWritable(count));
    }
}
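To see the data flow end to end: for an input line such as hello world hello (hypothetical data), the mapper emits (hello,1), (world,1), (hello,1); the shuffle groups these into hello → [1,1] and world → [1]; the reducer then sums each list. The same counting logic can be sanity-checked in plain Java without a cluster (an illustrative sketch, not part of the original article):

import java.util.HashMap;
import java.util.Map;

/** Illustrative only: simulates the map and reduce steps for one line of input. */
public class WordCountSim {
    public static void main(String[] args) {
        Map<String, Integer> counts = new HashMap<>();
        for (String s : "hello world hello".split(" ")) { // "map": one (word, 1) per token
            counts.merge(s, 1, Integer::sum);             // "reduce": sum the counts per word
        }
        System.out.println(counts);                       // e.g. {world=1, hello=2}
    }
}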
Dependencies in the project's pom.xml:
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.8.5</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
        <version>2.8.5</version>
    </dependency>
    <dependency>
        <groupId>commons-cli</groupId>
        <artifactId>commons-cli</artifactId>
        <version>1.2</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
</dependencies>
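If the job later reads from hdfs:// paths and fails with "java.io.IOException: No FileSystem for scheme: hdfs", the HDFS client classes are missing from the classpath. Adding the matching hadoop-hdfs artifact usually resolves this (an extra dependency not listed in the original article; verify whether your transitive dependencies already provide it):

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.8.5</version>
</dependency>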
Add the jars under the share folder of the hadoop directory (e.g. %HADOOP_HOME%\share\hadoop) to the project as libraries.
Writing the job (driver) class
- Using the local file system (fs.defaultFS = file:///)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

/**
 * WCApp: word-count driver that runs against the local file system.
 */
public class WCApp {
    public static void main(String[] args) throws Exception {
        // This runs against the local Windows file system; the linux cluster is not involved.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");

        Job job = Job.getInstance(conf);

        // Set the job's properties.
        job.setJobName("WCApp");                          // job name
        job.setJarByClass(WCApp.class);                   // class used to locate the jar
        job.setInputFormatClass(TextInputFormat.class);   // input format

        // Output format class (optional):
        //job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path

        // Max / min split sizes (optional):
        //FileInputFormat.setMaxInputSplitSize(job, 13);
        //FileInputFormat.setMinInputSplitSize(job, 1L);

        job.setPartitionerClass(MyPartitioner.class); // custom partitioner (class not shown in this article)
        job.setCombinerClass(WCReducer.class);        // combiner
        job.setMapperClass(WCMapper.class);           // mapper
        job.setReducerClass(WCReducer.class);         // reducer
        job.setNumReduceTasks(3);                     // number of reduce tasks

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    }
}
Note: args[0] and args[1] here are run arguments passed to the program.
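For example, the program arguments in an IDEA run configuration could be d:/mr/hello.txt d:/mr/out (hypothetical paths: the input file first, then the output directory). The output directory must not exist beforehand, otherwise FileOutputFormat refuses to start the job.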
- Using the linux cluster's HDFS
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RunJob {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "word count");
            job.setJarByClass(RunJob.class);
            job.setMapperClass(MyMap.class);       // mapper class (not shown in this listing)
            job.setReducerClass(MyReduce.class);   // reducer class (not shown in this listing)
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Fixes the error: "No job jar file set. User classes may not be found. See Job or Job#setJar(String)"
            job.setJar("E:\\bigData\\workspace\\testMR\\out\\artifacts\\MyMR\\MyMR.jar");
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileSystem fs = FileSystem.get(conf);
            Path op1 = new Path(args[1]);
            if (fs.exists(op1)) {
                fs.delete(op1, true);
                System.out.println("Output path already existed and has been deleted!");
            }
            FileOutputFormat.setOutputPath(job, op1);
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Note: for this variant the cluster's core-site.xml must be added to the project (e.g. on the classpath under src/main/resources) so the client knows where the cluster is, and the values of args[0] and args[1] must be configured accordingly; a sketch of the configuration follows.
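A minimal core-site.xml sketch, assuming the NameNode runs on a host named master on port 9000 (hostname and port are illustrative; in practice, copy the real file from the cluster):

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>
</configuration>

args[0] and args[1] can then be HDFS paths, e.g. /user/test/hello.txt and /user/test/out (again hypothetical).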
View the result in the HDFS web UI (for Hadoop 2.x, the NameNode UI listens on port 50070 by default).
- A possible error: Exception in thread "main" java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
Fix: download the source of the same hadoop version, take NativeIO.java from hadoop-common-project\hadoop-common\src\main\java\org\apache\hadoop\io\nativeio, copy it into the project under the same package (org.apache.hadoop.io.nativeio), and then modify the return value of the access method, as sketched below.
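The usual edit, sketched from the 2.x source (verify against the actual NativeIO.java you copied), replaces the body of Windows.access so it no longer calls the failing native access0 method:

// Inside the copied org.apache.hadoop.io.nativeio.NativeIO.Windows class:
public static boolean access(String path, AccessRight desiredAccess)
        throws IOException {
    // Original body: return access0(path, desiredAccess.accessRight());
    return true; // skip the native permission check that fails on Windows
}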
The two approaches differ mainly in the input source and the output location: the local file system versus the cluster's HDFS.