java-mapreduce实现wordcount(可指定src,dst)

mr完整程序核心分三个阶段（mapper、reducer、driver），外加准备（pom 配置）与打包测试两步
0.准备
pom.xml引入jar包

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <spring.version>4.3.13.RELEASE</spring.version>
    <hadoop.version>3.1.3</hadoop.version>
</properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <!-- 注：hadoop-client 已传递依赖 hadoop-common / hadoop-hdfs / hadoop-mapreduce-client-core，
             以下三项显式声明可省略，此处保留仅为版本对齐更直观 -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.0.2</version>
                <configuration>
                    <archive>
                        <manifest>
                            <addClasspath>true</addClasspath>
                            <classpathPrefix>lib/</classpathPrefix>
                         <!-- 该处存放所需打包的类中,main方法所在的位置 -->
                         <mainClass>me.test.mapreduce.mr.WordCountDriver</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
        </plugins>
    </build>
    

1.mapper阶段

package me.test.mapreduce.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper stage of the MR WordCount job.
 *
 * Mapper&lt;KEYIN, VALUEIN, KEYOUT, VALUEOUT&gt;:
 *   KEYIN/VALUEIN  — input key/value types. With the default TextInputFormat the
 *                    key is the byte offset of the line (LongWritable) and the
 *                    value is the line itself (Text).
 *   KEYOUT/VALUEOUT — output key/value types sent to the reducer:
 *                    key = word (Text), value = count of 1 (IntWritable).
 *
 * JDK types (String/Long/Integer) carry extra object overhead when serialized,
 * so Hadoop ships its own Writable types:
 *   long -> LongWritable, String -> Text, Integer -> IntWritable, null -> NullWritable
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused output objects. context.write() serializes the values immediately,
    // so reusing a single Text/IntWritable per task (instead of allocating one
    // per word) is safe and avoids heavy GC pressure on large inputs.
    private final Text outWord = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Called once per input record (one &lt;key, value&gt; pair, i.e. one line)
     * delivered by the input format.
     *
     * @param key     byte offset of the line within the input split
     * @param value   the line content
     * @param context MR context used to emit &lt;word, 1&gt; pairs to the reducer
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // Convert the incoming line to a Java String.
        String line = value.toString();

        // Split on runs of whitespace. Note: split(" ") would produce empty
        // tokens for consecutive spaces, which would then be counted as words.
        String[] words = line.split("\\s+");

        // Emit <word, 1> for every non-empty token.
        for (String word : words) {
            if (word.isEmpty()) {
                continue; // leading whitespace yields one empty token; skip it
            }
            outWord.set(word);
            context.write(outWord, ONE);
        }
    }
}

2.reducer阶段

package me.test.mapreduce.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer stage: receives the &lt;word, 1&gt; pairs emitted by the mapper.
 *
 * Input:  key = word (Text), value = partial count (IntWritable).
 * Output: key = word (Text), value = total count (IntWritable).
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value — avoids allocating a new IntWritable per key
    // (context.write serializes it immediately, so reuse is safe).
    private final IntWritable total = new IntWritable();

    /**
     * The framework sorts all incoming pairs by key (lexicographic order),
     * groups them by key, and invokes reduce() once per group with an
     * iterator over that group's values.
     *
     * Rough data flow:
     *   incoming: <hadoop,1><spark,1><hello,1><wordcount,1><hadoop,1><hadoop,1>
     *   sorted:   <hadoop,1><hadoop,1><hadoop,1><hello,1><spark,1><wordcount,1>
     *   group passed to reduce(): <hadoop, [1,1,1]>  -> emits <hadoop,3>
     *   next group: <hello, [1]>                     -> emits <hello,1>
     *   ... and so on for each remaining key.
     *
     * @param key     the word
     * @param values  all counts emitted for this word
     * @param context MR context used to emit the final &lt;word, total&gt; pair
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum the 1s for this word.
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        total.set(count);
        context.write(key, total);
    }
}

3.运行主类

package me.test.mapreduce.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Job driver: assembles everything the MR job needs to run —
 * input/output paths, the mapper and reducer classes, and the
 * intermediate/final key-value types — then submits the job.
 *
 * Usage: hadoop jar <jar> <input path> <output path>
 */
public class WordCountDriver extends Configured implements Tool {

    // Kept public/static for backward compatibility with any external readers;
    // they are (re)assigned from the command-line arguments in run().
    public static String INPUT = "";
    public static String OUTPUT = "";

    /**
     * Configures and submits the WordCount job.
     *
     * @param args args[0] = input path, args[1] = output path
     * @return 0 on success, 1 on job failure, 2 on bad usage
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: WordCountDriver <input path> <output path>");
            return 2;
        }
        INPUT = args[0];
        OUTPUT = args[1];

        // Use the Configuration injected by ToolRunner (via Configured) instead
        // of creating a fresh one: otherwise generic options such as
        // "-D key=value" parsed by ToolRunner are silently dropped.
        Configuration conf = getConf();

        // Delete the output path if it already exists, so the job can rerun.
        Path output = new Path(OUTPUT);
        if (output.getFileSystem(conf).exists(output)) {
            output.getFileSystem(conf).delete(output, true);
            System.out.println("输出路径存在,已自动删除。");
        }

        // Package this run's settings into a Job.
        Job job = Job.getInstance(conf, WordCountDriver.class.getSimpleName());
        // The jar containing this class is shipped to the cluster.
        job.setJarByClass(WordCountDriver.class);

        // Mapper / Reducer used by this job.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);

        // Map-output (intermediate) key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final (reducer) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input/output paths. Note the imports: these are the new-API classes
        // from org.apache.hadoop.mapreduce.lib.input / lib.output.
        FileInputFormat.setInputPaths(job, INPUT);
        FileOutputFormat.setOutputPath(job, output);

        // Return the status instead of calling System.exit() here: exiting
        // inside run() made the trailing "return 0" dead code and prevented
        // ToolRunner from reporting the real exit code to the caller.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Propagate the job status as the process exit code
        // (the original discarded ToolRunner's return value).
        int exitCode = ToolRunner.run(new Configuration(), new WordCountDriver(), args);
        System.exit(exitCode);
    }
}

4.打包上传,测试运行
（原文此处为 Maven 打包过程的截图，已省略；命令为 mvn clean package）
上传至hdfs,测试运行

[root@master ~]# hadoop jar  maven-hadoop-test1-2.0.jar /abcabcabc.txt /output
输出路径存在,已自动删除。
------------------
[root@master ~]# hadoop fs -ls -R /output
-rw-r--r--   1 root supergroup          0 2020-03-23 20:42 /output/_SUCCESS
-rw-r--r--   1 root supergroup         43 2020-03-23 20:42 /output/part-r-00000
[root@master ~]# hadoop fs -cat /output/part-r-00000
2020-03-23 20:43:24,182 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
aasasadadas     1
adasda  1
as      3
asd     3
d       3
sd      1
[root@master ~]# 
发布了7 篇原创文章 · 获赞 0 · 访问量 272
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览