IDEA+maven构建hadoopMR开发环境

最新推荐文章于 2022-05-16 19:37:00 发布

zxp209

最新推荐文章于 2022-05-16 19:37:00 发布

阅读量1k

点赞数

分类专栏： JAVA 文章标签： IDEA HADOOP MapReduce

本文链接：https://blog.csdn.net/zxp209/article/details/84911474

版权

JAVA 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

总结一下IDEA+maven构建的开发环境。
我的环境是WIN7（64位），hadoop 2.8，3台虚拟机组成的hadoop集群（两个做dataNode，一个做nameNode）。IDEA是2016.1，maven 3.9，java 1.7。

IDEA+maven 很简单了，跟着引导创建一个新maven项目就可以了。

下面是pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- artifactId + version determine the jar name (hadoop-1.0-SNAPSHOT.jar)
         that the job code points "mapreduce.job.jar" at. -->
    <groupId>hadoop.test</groupId>
    <artifactId>hadoop</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- Compile for the JDK used in this setup (Java 1.7) instead of relying on
             the compiler plugin's default language level. -->
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <!-- Single place to change the Hadoop client library version. -->
        <hadoop.version>2.8.0</hadoop.version>
    </properties>
    <dependencies>
        <!-- Test-only dependency: keep it off the runtime classpath. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>
</project>

首先确保集群开着，运行正常。
然后，把集群上的hadoop包拷贝到windows的某个目录下，创建HADOOP_HOME环境变量指向该目录，并把其中的bin目录加入PATH。
下载hadoop的windows扩展包。
[url]http://files.cnblogs.com/files/longshiyVip/hadoop2.6%28x64%29V0.2.zip[/url]
这个版本是2.6 64位，我hadoop2.8用着没有问题。
解压后覆盖到bin目录下，并把hadoop.dll放入system32中。相关配置文件拷贝到resources目录下，跟集群上保持一致就可以了。
其中需要log4j.properties，不然日志打印不出来。
# Root logger: INFO and above, routed to the "console" appender defined below.
log4j.rootLogger=INFO, console

# Console appender writing to standard output.
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.Target=System.out
# Line format: time, level, short logger name, source line number, message.
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n

开始写代码。
[img]http://dl2.iteye.com/upload/attachment/0127/6567/117a6678-bc92-3510-8dc2-543d3fc71fc8.png[/img]


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * Minimal MapReduce job that is submitted from a local (Windows) IDE to a remote YARN cluster.
 *
 * <p>Despite its name, the mapper does not tokenize its input: every input line is emitted
 * as a single key, so the job counts how often each distinct line occurs.
 *
 * <p>The job is meant to be started through {@link ToolRunner}, which parses Hadoop's generic
 * command-line options into the {@link Configuration} returned by {@link #getConf()}.
 */
public class WordCount extends Configured implements Tool {

    /** Jar shipped to the cluster; it must contain this class and its nested mapper/reducer. */
    private static final String JOB_JAR = "c:\\study\\java\\hadooptest\\target\\hadoop-1.0-SNAPSHOT.jar";
    private static final String RESOURCE_MANAGER_HOST = "master128";
    private static final String DEFAULT_FS = "hdfs://master128:9000";
    private static final String INPUT_PATH = "hdfs://master128:9000/zxq/input";
    private static final String OUTPUT_PATH = "hdfs://master128:9000/zxq/output";

    /**
     * Configures the job, submits it and blocks until it finishes.
     *
     * @param strings remaining command-line arguments (currently unused)
     * @return {@code 0} if the job completed successfully, {@code 1} otherwise
     * @throws Exception if the job cannot be configured or submitted; the failure is
     *         propagated instead of being swallowed so callers see a non-success outcome
     */
    public int run(String[] strings) throws Exception {
        // Use the configuration prepared by ToolRunner so generic options (-D, -conf, ...)
        // are honoured; fall back to a fresh one if run() is invoked directly.
        Configuration conf = getConf();
        if (conf == null) {
            conf = new Configuration();
        }

        // Classpath resources are resolved through a ClassLoader, which expects names
        // WITHOUT a leading '/'; with the slash the files are silently not found.
        conf.addResource("core-site.xml");
        conf.addResource("hdfs-site.xml");
        conf.addResource("mapred-site.xml");
        conf.addResource("yarn-site.xml");

        conf.set("mapreduce.job.jar", JOB_JAR);
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", RESOURCE_MANAGER_HOST);
        conf.set("fs.defaultFS", DEFAULT_FS);
        // Required when the submitting client and the cluster run different operating systems.
        conf.set("mapreduce.app-submission.cross-platform", "true");

        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setMapperClass(WcMapper.class);
        job.setReducerClass(WcReducer.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, INPUT_PATH);
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

        // Report the real outcome of the job instead of always returning success.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Emits each input line unchanged as the key, paired with a count of 1. */
    public static class WcMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        /** The constant count written for every record. */
        private static final LongWritable ONE = new LongWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            context.write(new Text(line), ONE);
        }
    }

    /** Sums all counts received for a key and writes the total. */
    public static class WcReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable count : values) {
                sum += count.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    /**
     * Entry point: runs the job through {@link ToolRunner} and uses the job's result
     * as the process exit code.
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new WordCount(), args));
    }
}

需要在JVM启动参数中加入hadoop用户名：-DHADOOP_USER_NAME=hadoop（根据自己的实际情况填写），否则会报访问权限问题。
jar包是一定要指定的：
conf.set("mapreduce.job.jar", "c:\\study\\java\\hadooptest\\target\\hadoop-1.0-SNAPSHOT.jar");

下面一些配置根据自己的实际填写，主要是主机名(或者ip)端口，输入输出文件。

 Configuration conf = new Configuration();

            // Cluster config files copied onto the classpath. Names are resolved through a
            // ClassLoader, so they must NOT start with '/'; with the slash nothing is loaded.
            conf.addResource("core-site.xml");
            conf.addResource("hdfs-site.xml");
            conf.addResource("mapred-site.xml");
            conf.addResource("yarn-site.xml");

            // Jar shipped to the cluster (absolute path of the locally built artifact).
            conf.set("mapreduce.job.jar", "c:\\study\\java\\hadooptest\\target\\hadoop-1.0-SNAPSHOT.jar");
            // Run on YARN; host name and port below must match your own cluster.
            conf.set("mapreduce.framework.name", "yarn");
            conf.set("yarn.resourcemanager.hostname", "master128");
            conf.set("fs.defaultFS", "hdfs://master128:9000");
            // Required when the submitting client and the cluster run different operating systems.
            conf.set("mapreduce.app-submission.cross-platform", "true");

            Job job = Job.getInstance(conf);
            job.setJarByClass(WordCount.class);

            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);

            job.setMapperClass(WcMapper.class);
            job.setReducerClass(WcReducer.class);

            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            // Input directory and output directory on HDFS.
            FileInputFormat.setInputPaths(job, "hdfs://master128:9000/zxq/input");
            FileOutputFormat.setOutputPath(job, new Path("hdfs://master128:9000/zxq/output"));

我写的是绝对路径，也就是mvn clean install生成的jar

我在构建这套环境的时候也遇到了很多问题。由于也是先在网上看文章然后自己实践，踩了不少坑。
1、windows插件的版本，一定要使用自己hadoop的版本。
2、连接问题：输入、输出文件的路径要写成“主机:端口”再加路径的形式，hadoop会从中截取主机和端口去访问nameNode。
3、就是安全访问问题，要hadoop的登录用户，最简单的办法就是加jvm启动参数 -DHADOOP_USER_NAME=hadoop。
网上还有其他一些方法，比如修改自己windows的用户名，使之和hadoop用户保持一致，或者更改hdfs文件的权限。
使用HDFS的命令行接口修改相应目录的权限，hadoop fs -chmod 777 /user,后面的/user是要上传文件的路径，不同的情况可能不一样，比如要上传的文件路径为hdfs://namenode/user/xxx.doc，则这样的修改可以，如果要上传的文件路径为hdfs://namenode/java/xxx.doc，则要修改的为hadoop fs -chmod 777 /java或者hadoop fs -chmod 777 /，java的那个需要先在HDFS里面建立Java目录，后面的这个是为根目录调整权限。

按照上述代码，在input下加入一些文件作为wordcount的输入文件。

# Create the input directory first so -put uploads into it instead of creating a file named "input".
hadoop fs -mkdir -p /zxq/input
hadoop fs -put wordCount.txt /zxq/input

开始执行。

10:16:09,529  INFO RMProxy:123 - Connecting to ResourceManager at master128/172.23.132.84:8032
10:16:09,786  WARN JobResourceUploader:64 - Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
10:16:09,924  INFO FileInputFormat:289 - Total input files to process : 1
10:16:09,980  INFO JobSubmitter:200 - number of splits:1
10:16:10,496  INFO JobSubmitter:289 - Submitting tokens for job: job_1509588776406_0004
10:16:10,674  INFO YarnClientImpl:296 - Submitted application application_1509588776406_0004
10:16:10,699  INFO Job:1345 - The url to track the job: http://master128:8088/proxy/application_1509588776406_0004/
10:16:10,700  INFO Job:1390 - Running job: job_1509588776406_0004
10:16:15,835  INFO Job:1411 - Job job_1509588776406_0004 running in uber mode : false
10:16:15,839  INFO Job:1418 -  map 0% reduce 0%
10:16:21,069  INFO Job:1418 -  map 100% reduce 0%
10:16:26,122  INFO Job:1418 -  map 100% reduce 100%
10:16:26,162  INFO Job:1429 - Job job_1509588776406_0004 completed successfully
10:16:26,286  INFO Job:1436 - Counters: 49
	File System Counters
		FILE: Number of bytes read=363
		FILE: Number of bytes written=273713
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=257
		HDFS: Number of bytes written=162
		HDFS: Number of read operations=6
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=2
	Job Counters 
		Launched map tasks=1
		Launched reduce tasks=1
		Data-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=2508
		Total time spent by all reduces in occupied slots (ms)=2528
		Total time spent by all map tasks (ms)=2508
		Total time spent by all reduce tasks (ms)=2528
		Total vcore-milliseconds taken by all map tasks=2508
		Total vcore-milliseconds taken by all reduce tasks=2528
		Total megabyte-milliseconds taken by all map tasks=2568192
		Total megabyte-milliseconds taken by all reduce tasks=5177344
	Map-Reduce Framework
		Map input records=21
		Map output records=21
		Map output bytes=315
		Map output materialized bytes=363
		Input split bytes=110
		Combine input records=0
		Combine output records=0
		Reduce input groups=18
		Reduce shuffle bytes=363
		Reduce input records=21
		Reduce output records=18
		Spilled Records=42
		Shuffled Maps =1
		Failed Shuffles=0
		Merged Map outputs=1
		GC time elapsed (ms)=451
		CPU time spent (ms)=2930
		Physical memory (bytes) snapshot=487813120
		Virtual memory (bytes) snapshot=4467601408
		Total committed heap usage (bytes)=455606272
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters 
		Bytes Read=147
	File Output Format Counters 
		Bytes Written=162