[Hadoop] MapReduce Examples: Word Frequency Counting

I. Preliminaries

1. Create a new Maven project
2. Add the project dependencies to pom.xml

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.hdtrain</groupId>
  <artifactId>wordcount</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>wordcount</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- set the JDK version -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
      <dependency>
          <groupId>junit</groupId>
          <artifactId>junit</artifactId>
          <version>4.11</version>
          <scope>test</scope>
      </dependency>
      <!-- logging dependency -->
      <dependency>
          <groupId>log4j</groupId>
          <artifactId>log4j</artifactId>
          <version>1.2.17</version>
      </dependency>
      <!--hadoop-hdfs-->
      <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-hdfs</artifactId>
          <version>2.7.1</version>
      </dependency>
      <!--hadoop-common-->
      <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-common</artifactId>
          <version>2.7.1</version>
      </dependency>
      <!--hadoop-client-->
      <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
          <version>2.7.1</version>
      </dependency>
      <!--hadoop-mapreduce-->
      <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-mapreduce-client-core</artifactId>
          <version>2.7.1</version>
      </dependency>
      <!-- IK Analyzer dependency -->
      <dependency>
          <groupId>com.janeluo</groupId>
          <artifactId>ikanalyzer</artifactId>
          <version>2012_u6</version>
      </dependency>
  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>

3. Add a resources folder to the project
Add the configuration files core-site.xml, hdfs-site.xml and mapred-site.xml to it
Mark the folder as the resources root so that these files end up on the classpath (a quick check is sketched below)
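
With these *-site.xml files on the classpath, new Configuration(true) picks them up automatically. The snippet below is a minimal sanity check of my own (the class name ConfigCheck is hypothetical, not part of the project): if the resources folder is set up correctly, fs.defaultFS is read from core-site.xml instead of falling back to the built-in default file:///.

package com.hdtrain;

import org.apache.hadoop.conf.Configuration;

// Hypothetical helper, not part of the original project: prints a property
// that should come from the core-site.xml placed under resources.
public class ConfigCheck {
    public static void main(String[] args) {
        Configuration configuration = new Configuration(true); // true = also load *-site.xml from the classpath
        System.out.println("fs.defaultFS = " + configuration.get("fs.defaultFS"));
    }
}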

II. WordCount Example

The English text of "Harry Potter" is used as the input data.
1.WordCountJob.class

package com.hdtrain;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

// Driver class that defines the wordcount job
public class WordCountJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        //System.setProperty("HADOOP_USER_NAME", "root");
        //load the configuration files
        Configuration configuration = new Configuration(true);
        configuration.set("mapreduce.framework.name", "local");
        //create the job
        Job job = Job.getInstance(configuration);
        //set the job parameters
        job.setJobName("wordcount-" + System.currentTimeMillis()); //job name
        job.setJarByClass(WordCountJob.class); //main class of this job
        job.setNumReduceTasks(2);

        //path of the input file to process
        FileInputFormat.setInputPaths(job, "/data/harry.txt");
        //path for the output results
        FileOutputFormat.setOutputPath(job, new Path("/results/wordcount-"+System.currentTimeMillis()));

        //key/value types emitted by the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        //mapper class
        job.setMapperClass(WordCountMapper.class);
        //reducer class
        job.setReducerClass(WordCountReducer.class);

        //submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}
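
The input and output paths above are hard-coded. A common variant, sketched below only as an illustration (the class name WordCountTool is hypothetical, not part of the original project), reads the paths from the command line via ToolRunner so the same jar can be pointed at other files:

package com.hdtrain;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical variant of the driver: input/output paths come from args[0] and args[1]
public class WordCountTool extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJobName("wordcount-" + System.currentTimeMillis());
        job.setJarByClass(WordCountTool.class);
        job.setNumReduceTasks(2);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(true), new WordCountTool(), args));
    }
}

It would be invoked with the input file as the first argument and a not-yet-existing output directory as the second.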

2.WordCountMapper.class

package com.hdtrain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //extract the words: strip punctuation, then split on spaces
        String[] words = value.toString().replaceAll("[^a-zA-Z0-9\\s']", "").split(" ");
        //write each word to the context with a count of 1; the reducer will sum these counts
        for (int i = 0; i < words.length; i++){
            context.write(new Text(words[i]), new IntWritable(1));
        }
    }
}
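
To see what the map() tokenization actually produces, here is a small standalone sketch of my own (not part of the job; the sample line is only an illustration). Note that consecutive spaces yield empty strings, which the mapper above would count under an empty key:

package com.hdtrain;

// Standalone sketch: applies the same expression as WordCountMapper to one sample line
public class TokenizeDemo {
    public static void main(String[] args) {
        String line = "Mr. and Mrs. Dursley,  of number four";
        //strip punctuation, then split on a single space (same as the mapper)
        String[] words = line.replaceAll("[^a-zA-Z0-9\\s']", "").split(" ");
        for (String word : words) {
            System.out.println("[" + word + "]"); //brackets make empty tokens visible
        }
    }
}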

3.WordCountReducer.class

package com.hdtrain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        //accumulator for the total count
        long count = 0;
        //get the iterator over the values
        Iterator<IntWritable> iterator = values.iterator();
        //sum all of the 1s emitted by the mappers for this word
        while (iterator.hasNext()){
            int value = iterator.next().get();
            count += value;
        }
        //write out the final count for this word
        context.write(key, new LongWritable(count));
    }
}
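
Because counting is associative, a combiner can pre-sum the 1s on the map side before the shuffle. The sketch below is an addition of mine, not part of the original code; its output value type must stay IntWritable to match the map output, so the reducer above (which emits LongWritable) cannot be reused as the combiner.

package com.hdtrain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

// Hypothetical combiner: locally sums the counts emitted by one map task
public class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

It would be registered in the driver with job.setCombinerClass(WordCountCombiner.class);.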

4. Results
(screenshot of the job output)

III. Word Frequency Example: "The Legend of the Condor Heroes" (射雕英雄传)

The Chinese text of the novel 射雕英雄传 is used as the input data; the IK Analyzer handles the Chinese word segmentation.
1.SdyxzJob.class

package com.hdtrain;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class SdyxzJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        //1. load the configuration files
        Configuration configuration = new Configuration(true);
        configuration.set("mapreduce.framework.name", "local");

        //2. create the job
        Job job = Job.getInstance(configuration);

        //3. set the job parameters
        job.setJobName("射雕英雄传-"+System.currentTimeMillis());
        job.setJarByClass(SdyxzJob.class);
        job.setNumReduceTasks(2);

        //4. path of the input data file
        FileInputFormat.setInputPaths(job, new Path("/data/sdyxz.txt"));

        //5. path for the output results
        FileOutputFormat.setOutputPath(job, new Path("/results/sdyxz-"+System.currentTimeMillis()));

        //6. key/value types emitted by the map phase
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        //7. mapper class
        job.setMapperClass(SdyxzMapper.class);

        //8. reducer class
        job.setReducerClass(SdyxzReducer.class);

        //9. submit the job
        job.waitForCompletion(true);
    }
}

2.SdyxzMapper.class

package com.hdtrain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;

public class SdyxzMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //wrap the line in a reader and hand it to the IK segmenter (true = smart mode)
        StringReader stringReader = new StringReader(value.toString());
        IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);
        Lexeme lexeme = null;
        //emit each segmented word with a count of 1
        while((lexeme = ikSegmenter.next()) != null){
            context.write(new Text(lexeme.getLexemeText()), new IntWritable(1));
        }
    }
}

3.SdyxzReducer.class

package com.hdtrain;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

public class SdyxzReducer extends Reducer<Text, IntWritable, Text, LongWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        Iterator<IntWritable> iterator = values.iterator();
        while(iterator.hasNext()){
            int value = iterator.next().get();
            count += value;
        }
        context.write(key, new LongWritable(count));
    }
}

4. IK Analyzer example

package com.hdtrain;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;

public class IKword {
    public static void main(String[] args) throws IOException {

        StringReader stringReader = new StringReader("畔一排数十株乌柏树,叶子似火烧般红,正是八月天时。村前村后的野草刚起始变黄,一抹斜阳映照之下,更增了几分萧索。两株大松树下围着一堆村民,男男女女和十几个小孩,正自聚精会神的听着一个瘦削的老者说话。");
        IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);
        Lexeme lexeme = null;
        while((lexeme = ikSegmenter.next()) != null){
            System.out.println(lexeme.getLexemeText());
        }
    }
}

5. Results
(screenshot of the job output)
