mapreduce练习：小说《我的前半生》文本信息统计分词统计业务，集群执行任务

最新推荐文章于 2023-03-28 17:06:01 发布

weixin_41044499

最新推荐文章于 2023-03-28 17:06:01 发布

阅读量283

点赞数

文章标签： hdfs hadoop mapreduce

本文链接：https://blog.csdn.net/weixin_41044499/article/details/95042341

版权

mapreduce练习集群执行任务

参考之前的朴素贝叶斯新闻分类练习 https://blog.csdn.net/weixin_41044499/article/details/94591422

精确统计不同主题下的分词的频率对朴素贝叶斯新闻的分类非常重要。

这里借助mapreduce的练习，离线统计任务实现大量数据的词频统计，可以用于后续文本做朴素贝叶斯统计。

加载 IKAnalyzer 做分词，对小说《我的前半生》做 word-count，同时统计 1-gram（单个词）和 2-gram（相邻两个词）的出现次数。

文本如下：

将文件上传至hdfs，/data/input/half.txt。文件编码为UTF-8，防止中文乱码。

文件加载依赖包：

IKAnalyzer分词包

<dependency>

<groupId>com.janeluo</groupId>

<artifactId>ikanalyzer</artifactId>

<version>2012_u6</version>

</dependency>

hadoop包

<!-- Dependencies of the word-count job: JUnit for tests plus the Hadoop 2.7.4 artifacts. -->
<dependencies>

    <!-- Unit-test framework; test scope only, so it is not part of the packaged job. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>

    <!-- Hadoop artifacts; all four are pinned to the same version (2.7.4). -->
    <!-- NOTE(review): the version presumably has to match the cluster's Hadoop version; confirm. -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.7.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.4</version>
    </dependency>

  </dependencies>

用于打包的maven插件：

<!-- Packages the job and all of its dependencies into one jar during `mvn package`. -->
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.3</version>
<configuration>
<!-- Keep the plain artifact name instead of appending "-jar-with-dependencies". -->
<appendAssemblyId>false</appendAssemblyId>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<!-- "single" is the goal intended for binding to a lifecycle phase; the
     deprecated "assembly" goal is meant for command-line use only. -->
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

代码如下：

package com.jianan.hadoop.wordcount2gram;

import java.io.IOException;
import java.io.StringReader;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * MapReduce job that counts 1-gram and 2-gram frequencies of a text file,
 * using IKAnalyzer ({@link IKSegmenter}) to split each line into words.
 *
 * <p>Usage: {@code hadoop jar <jar> com.jianan.hadoop.wordcount2gram.WordCount [input [output]]}.
 * When the arguments are omitted the job falls back to {@value #DEFAULT_INPUT}
 * and {@value #DEFAULT_OUTPUT}.
 */
public class WordCount {

    /** Input path used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "/data/input/half.txt";

    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "/data/output1";

    /** Job jar configured before {@code setJarByClass} is called. */
    private static final String FALLBACK_JAR = "/root/apps/1.jar";

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional {@code [input path, output directory]}
     * @throws IOException            if the job cannot be set up or submitted
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws InterruptedException   if waiting for job completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        // 1. Create the job from the default configuration.
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // The explicit jar is set first; setJarByClass replaces it whenever the
        // class can be located inside a jar on the classpath.
        job.setJar(FALLBACK_JAR);
        job.setJarByClass(WordCount.class);

        // 2. Mapper / reducer classes. The reducer only sums its input values and
        //    has matching input/output types, so it is also used as the combiner
        //    to pre-aggregate map output before the shuffle.
        job.setMapperClass(WordcountMapper.class);
        job.setCombinerClass(WordcountReducer.class);
        job.setReducerClass(WordcountReducer.class);

        // 3. Key/value types emitted by the mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 4. Key/value types of the final output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 5. Input file and output directory.
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }

    /**
     * Emits {@code (word, 1)} for every word of a line and {@code ("prev word", 1)}
     * for every pair of adjacent words within the same line.
     */
    public static class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Constant count attached to every emitted key. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Reused output key; avoids allocating a new Text for every record. */
        private final Text outKey = new Text();

        /**
         * Called once per input line by the map task.
         *
         * @param key     offset key supplied by the input format (unused)
         * @param value   one line of the input file
         * @param context sink for the emitted key/value pairs
         * @throws IOException          if segmentation or writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Text.toString() already decodes the record's UTF-8 bytes. Re-encoding
            // through the platform default charset (as getBytes() does) would
            // corrupt Chinese text on nodes whose default charset is not UTF-8.
            String line = value.toString().trim();
            if (line.isEmpty()) {
                return;
            }

            // Second argument enables IK's "smart" segmentation mode.
            IKSegmenter segmenter = new IKSegmenter(new StringReader(line), true);

            String previous = null; // no previous word yet, so no 2-gram for the first word
            Lexeme lexeme;
            // Failures are propagated instead of swallowed so that a broken task is
            // reported as failed rather than silently producing partial counts.
            while ((lexeme = segmenter.next()) != null) {
                String word = lexeme.getLexemeText();
                if (previous != null) {
                    outKey.set(previous + " " + word);
                    context.write(outKey, ONE);
                }
                outKey.set(word);
                context.write(outKey, ONE);
                previous = word;
            }
        }
    }

    /**
     * Sums the counts of each key. Also registered as the job's combiner.
     */
    public static class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        /** Reused output value; avoids allocating a new IntWritable for every key. */
        private final IntWritable total = new IntWritable();

        /**
         * @param key     a 1-gram or 2-gram
         * @param values  all counts emitted for {@code key}
         * @param context sink for the {@code (key, sum)} pair
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // 1. Add up all partial counts of this key.
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }

            // 2. Emit the total for this key.
            total.set(count);
            context.write(key, total);
        }
    }
}

集群运行模式

hadoop jar wordcount.jar com.jianan.hadoop.wordcount2gram.WordCount

补充说明 map 输出之后的两个阶段。Combiner 阶段：对 map 输出的数据进行局部聚合处理，键相等的键值对会调用一次 reduce 方法；经过这一阶段，数据量会减少。本阶段默认是没有的，需要在 job 中显式指定。分区阶段：按照一定的规则对上一阶段（map）输出的键值对进行分区，默认只有一个区；分区的数量就是 Reducer 任务运行的数量，默认只有一个 Reducer 任务。

统计后的结果如下：

我们发现了一个令人惊讶的结果：“唐晶”是小说所有人物中出现次数最高的，之后才是涓生、安儿、子君；玲玲的出现次数相比之下则很少。这也反映出“闺蜜情深”是小说的主要剧情设置。