The previous post covered how to build a simple, runnable Hadoop cluster with Docker. Now that the cluster is up, it's time to actually use it.
The simplest kind of Hadoop application is probably a wordcount-style job, so this post walks through that whole flow.
Project setup
This is an IDEA project built with Maven. Here is the pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>mavenusage</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>commons-beanutils</groupId>
            <artifactId>commons-beanutils</artifactId>
            <version>1.9.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>2.8.5</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.8.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.8.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.8.5</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass>com.myhadoop.WordCount</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
There is also an HDFSConnect.java, used to check that the code can actually reach the Hadoop cluster we set up. Its contents:
package com.myhadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HDFSConnect {
    public static void main(String[] args) throws IOException {
        System.out.println("hello world");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:19000");
        FileSystem hdfs = FileSystem.get(conf);
        boolean is_success = hdfs.mkdirs(new Path("/guoruibiaonew"));
        if (is_success) {
            System.out.println("success");
        } else {
            System.out.println("failure");
        }
        hdfs.close();
    }
}
Note the port here: Docker maps the NameNode port to 19000 on the local machine. If the firewall is set up to allow it and you connect directly into the Docker container, the address should be 172.18.0.2:9000 instead.
To check whether the directory was created, pick any node and run:
hdfs dfs -ls /
Writing the code
The code simply follows the map + reduce pattern: for a line like "hello world" the mapper emits (hello, 1) and (world, 1); the reducer then receives each word together with all of its 1s and sums them.
MyMapper.java
package com.myhadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private Text k = new Text();
    private IntWritable v = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] words = value.toString().split(" ");
        for (String word : words) {
            k.set(word);
            context.write(k, v);
        }
    }
}
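A small caveat, not from the original post: splitting on a single space means runs of spaces or tabs would produce empty or merged tokens. Splitting on the whitespace regex instead, i.e. value.toString().split("\\s+"), is more tolerant; for the sample data.log used below, plain space splitting is fine.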
MyReducer.java
package com.myhadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        v.set(sum);
        context.write(key, v);
    }
}
WordCount.java
package com.myhadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCount {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Build the Configuration instance
        Configuration configuration = new Configuration();
        // Other configuration would go here if needed

        // Get the Job instance
        Job job = Job.getInstance(configuration, "My WordCount Job");
        job.setJarByClass(WordCount.class);

        // Set the Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // Set the Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job and wait; exit with 0 on success, 1 on failure
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
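One optional tweak that the original driver does not include: since summing counts is associative, the same MyReducer class can also be registered as a combiner via job.setCombinerClass(MyReducer.class), which pre-aggregates counts on the map side and shrinks the shuffle. The job counters in the run below show Combine input records=0, i.e. no combiner was used there.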
Packaging the project
Since the project is built with Maven, packaging is easy. The steps are:
mvn clean
mvn package
In the IDE this corresponds to the actions shown in the screenshot below.
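After packaging, the artifacts end up under target/. With the artifactId and version above that should be target/mavenusage-1.0-SNAPSHOT.jar, plus a mavenusage-1.0-SNAPSHOT-jar-with-dependencies.jar generated by the assembly plugin; the plain jar is the one copied into the containers below.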
Running it on Hadoop
Before running the job, write some test content and put it on HDFS. For example, a data.log with the following lines:
hello world
hello hadoop
hello tiger
this is a data file.
Since this file was written locally, it still needs to be copied into the Docker container. The command is:
docker cp /Users/biao/IDEAProjects/mavenusage/data.log 7da3f0644f0f:/tmp
Then, on hadoop-node1, use the hdfs command to upload the file to HDFS:
# If the target directory does not exist on HDFS yet, create it first:
# hdfs dfs -mkdir /guoruibiao
hdfs dfs -put /tmp/data.log /guoruibiao
Also note that the jar packaged by Maven needs to be copied onto the data nodes as well, otherwise the run will fail. The commands:
docker cp /Users/biao/IDEAProjects/mavenusage/target/mavenusage-1.0-SNAPSHOT.jar 7da3f0644f0f:/tmp
docker cp /Users/biao/IDEAProjects/mavenusage/target/mavenusage-1.0-SNAPSHOT.jar fe846930210d:/tmp
Adjust the paths to your own setup; the ones above are just a reference.
do it
Everything is in place, so now hand the jar over to Hadoop and run it.
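One thing to keep in mind: FileOutputFormat requires that the output directory (/wordcountoutput here) does not already exist, so when rerunning the job, remove it first with hdfs dfs -rm -r /wordcountoutput.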
[root@hadoop-node1 tmp]# hadoop jar /tmp/mavenusage-1.0-SNAPSHOT.jar com.myhadoop.WordCount /guoruibiao/data.log /wordcountoutput
20/04/11 06:52:08 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
20/04/11 06:52:09 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
20/04/11 06:52:10 INFO input.FileInputFormat: Total input files to process : 1
20/04/11 06:52:10 INFO mapreduce.JobSubmitter: number of splits:1
20/04/11 06:52:10 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1586586904355_0003
20/04/11 06:52:11 INFO impl.YarnClientImpl: Submitted application application_1586586904355_0003
20/04/11 06:52:11 INFO mapreduce.Job: The url to track the job: http://hadoop-node1:8088/proxy/application_1586586904355_0003/
20/04/11 06:52:11 INFO mapreduce.Job: Running job: job_1586586904355_0003
20/04/11 06:52:22 INFO mapreduce.Job: Job job_1586586904355_0003 running in uber mode : false
20/04/11 06:52:22 INFO mapreduce.Job: map 0% reduce 0%
20/04/11 06:52:33 INFO mapreduce.Job: map 100% reduce 0%
20/04/11 06:52:42 INFO mapreduce.Job: map 100% reduce 100%
20/04/11 06:52:43 INFO mapreduce.Job: Job job_1586586904355_0003 completed successfully
20/04/11 06:52:43 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=130
FILE: Number of bytes written=315797
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=167
HDFS: Number of bytes written=64
HDFS: Number of read operations=6
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=8082
Total time spent by all reduces in occupied slots (ms)=6327
Total time spent by all map tasks (ms)=8082
Total time spent by all reduce tasks (ms)=6327
Total vcore-milliseconds taken by all map tasks=8082
Total vcore-milliseconds taken by all reduce tasks=6327
Total megabyte-milliseconds taken by all map tasks=8275968
Total megabyte-milliseconds taken by all reduce tasks=6478848
Map-Reduce Framework
Map input records=4
Map output records=11
Map output bytes=102
Map output materialized bytes=130
Input split bytes=109
Combine input records=0
Combine output records=0
Reduce input groups=9
Reduce shuffle bytes=130
Reduce input records=11
Reduce output records=9
Spilled Records=22
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=142
CPU time spent (ms)=1690
Physical memory (bytes) snapshot=412508160
Virtual memory (bytes) snapshot=3884216320
Total committed heap usage (bytes)=270008320
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=58
File Output Format Counters
Bytes Written=64
[root@hadoop-node1 tmp]# hdfs dfs -ls /wordcountoutput
Found 2 items
-rw-r--r-- 2 root supergroup 0 2020-04-11 06:52 /wordcountoutput/_SUCCESS
-rw-r--r-- 2 root supergroup 64 2020-04-11 06:52 /wordcountoutput/part-r-00000
[root@hadoop-node1 tmp]# hdfs dfs -cat /wordcountoutput/part-r-00000
a 1
data 1
file. 1
hadoop 1
hello 3
is 1
this 1
tiger 1
world 1
[root@hadoop-node1 tmp]#
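By the way, the WARN mapreduce.JobResourceUploader line near the top of the log appears because the driver does not implement the Tool interface. A minimal sketch of a ToolRunner-based driver, purely as an illustration and not part of the original project (the class name WordCountTool is made up here), could look like this:

package com.myhadoop;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical variant of the WordCount driver that implements Tool,
// so that generic options such as -D key=value are handled by ToolRunner.
public class WordCountTool extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // getConf() returns the Configuration that ToolRunner has already
        // populated from the command-line generic options.
        Job job = Job.getInstance(getConf(), "My WordCount Job");
        job.setJarByClass(WordCountTool.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new WordCountTool(), args));
    }
}

Running it is the same as before, except the class name after hadoop jar would be com.myhadoop.WordCountTool, and generic options such as -D mapreduce.job.reduces=2 would then be parsed by ToolRunner instead of triggering that warning.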
enjoy it.