Hadoop_day05: An Introductory MapReduce Example (WordCount: Counting Word Occurrences)


1. Prepare the Data

1.1 Create a new file

cd /export/servers
vim wordcount.txt

1.2 Put the following content into it and save

hello,world,hadoop
hive,sqoop,flume,hello
kitty,tom,jerry,world
hadoop

1.3 Upload it to HDFS

hdfs dfs -mkdir /wordcount/
hdfs dfs -put wordcount.txt /wordcount/
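
You can verify the upload with hdfs dfs -cat /wordcount/wordcount.txt before moving on.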

2. Import Dependencies (pom.xml)

<dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>RELEASE</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                    <!-- <verbose>true</verbose> -->
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <minimizeJar>true</minimizeJar>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

3. Mapper

package com.cpz.wordcount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable,Text,Text,LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text text = new Text();
        LongWritable longWritable = new LongWritable();
        // Split the line of text into words
        String line = value.toString();
        String[] split = line.split(",");

        // Iterate over the words and build the K2/V2 pairs
        for (String word : split){
            // Write K2 and V2 to the context
            text.set(word);
            longWritable.set(1);
            context.write(text,longWritable);
        }
    }
}
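
Before wiring the Mapper into a job, it can be handy to unit-test it in isolation. The sketch below uses MRUnit's MapDriver; this test class and the org.apache.mrunit:mrunit test dependency (version 1.1.0, classifier hadoop2) are assumptions for illustration, not part of the original project.

package com.cpz.wordcount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class WordCountMapperTest {
    @Test
    public void emitsOnePairPerWord() throws Exception {
        // Hypothetical MRUnit test: one comma-separated line in,
        // one (word, 1) pair out per token, in input order
        MapDriver.newMapDriver(new WordCountMapper())
                .withInput(new LongWritable(0), new Text("hello,world,hadoop"))
                .withOutput(new Text("hello"), new LongWritable(1))
                .withOutput(new Text("world"), new LongWritable(1))
                .withOutput(new Text("hadoop"), new LongWritable(1))
                .runTest();
    }
}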

4. Reducer

package com.cpz.wordcount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text,LongWritable,Text,LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        // Iterate over the values and sum them to get V3
        for (LongWritable value : values) {
            count += value.get();
        }
        // Write K3 and V3 to the context
        context.write(key,new LongWritable(count));
    }
}
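
Because the reduce logic is a plain sum, which is associative and commutative, the same class can also act as a combiner, pre-aggregating (word, 1) pairs on the map side to shrink the shuffle. This is an optional extra that is not in the original job setup; if you want it, add one line when configuring the job in JobMain:

// Optional: reuse the Reducer as a combiner to pre-aggregate map output
job.setCombinerClass(WordCountReducer.class);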

5. JobMain

package com.cpz.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

public class JobMain extends Configured implements Tool {
    // This method describes and configures a single job
    @Override
    public int run(String[] strings) throws Exception {
        // 1. Create a job object
        Job job = Job.getInstance(super.getConf(), "wordcount");
        // If running the packaged jar fails, uncomment this line
        // job.setJarByClass(JobMain.class);

        // 2. Configure the job (eight steps)
        // Step 1: set the input format and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/wordcount"));

        // Step 2: set the map-phase class
        job.setMapperClass(WordCountMapper.class);
        // Set the map output types (K2, V2)
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Steps 3-6 (partition, sort, combine, group) use the framework defaults

        // Step 7: set the reduce-phase class and its data types
        job.setReducerClass(WordCountReducer.class);
        // Set K3 and V3
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Step 8: set the output format
        job.setOutputFormatClass(TextOutputFormat.class);
        // Set the output path
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/wordcount_out"));

        // The job fails if the output directory already exists, so delete it first
        // Get a FileSystem handle
        FileSystem fileSystem = FileSystem.get(new URI("hdfs://node01:8020"), super.getConf());
        // Check whether the output directory exists
        boolean exists = fileSystem.exists(new Path("hdfs://node01:8020/wordcount_out"));
        if (exists) {
            // Delete the output directory recursively
            fileSystem.delete(new Path("hdfs://node01:8020/wordcount_out"), true);
        }

        // Submit the job and wait for completion
        boolean bl = job.waitForCompletion(true);
        return bl ? 0 : 1; // 0 = success, 1 = failure
    }
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Launch the job through ToolRunner
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}
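
To try the job end to end, package the project with mvn package and submit the shaded jar. The jar name below is a placeholder that depends on your artifactId and version, and job.setJarByClass(JobMain.class) should be uncommented when running from a jar:

hadoop jar wordcount-1.0-SNAPSHOT.jar com.cpz.wordcount.JobMain

For the sample input above, /wordcount_out/part-r-00000 should contain one tab-separated line per word, sorted by key:

flume	1
hadoop	2
hello	2
hive	1
jerry	1
kitty	1
sqoop	1
tom	1
world	2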

 
