重新编译hadoop的wordcount集成ik分词器

最新推荐文章于 2020-06-26 05:14:18 发布

wallfeacers

最新推荐文章于 2020-06-26 05:14:18 发布

阅读量642

点赞数 1

分类专栏： hadoop 文章标签： hadoop-wordcount 集成IK分词器

本文链接：https://blog.csdn.net/qq_27385301/article/details/67631912

版权

hadoop 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

1、手动安装ik-analyzers到本地仓库。这是因为maven仓库支持的版本太高，目前支持5.1.0版本；在编译的时候楼主使用的JDK版本是1.8.0_111，而运行的时候使用的JDK版本是1.7.0_79，这样的话有个版本问题
解决方式一：JDK版本统一（这个没试）
解决方式二：手动安装低版本的ik-analyzers（楼主使用的是这种方式）

第一步：下载ik-analyzers
https://github.com/wks/ik-analyzer（页面中有个pom.xml文件，可以看到ik的版本是3.2.8）

第二步：解压下载好的ik-analyzer-master.zip压缩包，使用CMD（在Windows中）进入到与pom.xml同级目录。
执行命令：mvn install -Dmaven.test.skip=true（install 会在打包之后把 jar 安装到本地仓库；只执行 package 只会在 target 目录下生成 jar，后面的工程无法解析该依赖）

2、创建maven工程
pom.xml文件：

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.wsz.hadoop.wordcount</groupId>
    <artifactId>MyHadoopMRIK</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- The Java sources contain non-ASCII comments; pin the encoding so the
             build does not depend on the platform default (e.g. GBK on Windows). -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- The job is compiled on JDK 1.8 but executed on JDK 1.7, so the
             emitted class files must target 1.7. -->
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.6.1</version>
        </dependency>

        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.2</version>
        </dependency>
        <!-- Not resolved from a remote repository: this artifact must first be
             installed into the local repository from the ik-analyzer sources. -->
        <dependency>
            <groupId>org.wltea.ik-analyzer</groupId>
            <artifactId>ik-analyzer</artifactId>
            <version>3.2.8</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Builds an executable "jar-with-dependencies" during the package
                 phase so the IK analyzer classes ship inside the job jar. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <!-- Pinned so the build is reproducible and Maven does not warn
                     about a missing plugin version. -->
                <version>2.6</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.wsz.hadoop.wordcount.MyHadoopMRIK</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

WordCount类：

package com.wsz.hadoop.wordcount;

import java.io.IOException;
import java.io.StringReader;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;

/**
 * Word-count MapReduce job whose mapper splits each input record into words
 * with the IK segmenter instead of splitting on whitespace.
 *
 * <p>Usage: {@code wordcount <in> [<in>...] <out>} - every argument except the
 * last is added as an input path; the last argument is the output path.
 */
public class MyHadoopMRIK {

    /**
     * Configures and submits the job, then exits with status 0 on success and
     * 1 on failure or on invalid arguments.
     *
     * @param args generic Hadoop options followed by the input path(s) and the
     *             output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        // At least one input path plus the output path are required. With a
        // single argument the job would be submitted with no input at all.
        if (otherArgs.length < 2) {
            System.err.println("Usage: wordcount <in> [<in>...] <out>");
            System.exit(1);
        }

        // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
        Job job = Job.getInstance(conf, "MyHadoopMRIK");
        job.setJarByClass(MyHadoopMRIK.class);
        job.setMapperClass(TokenizerMapper.class);
        // Summing is associative, so the reducer doubles as the combiner.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        int outputIndex = otherArgs.length - 1;
        for (int i = 0; i < outputIndex; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[outputIndex]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /** Sums all counts received for a word and emits {@code (word, total)}. */
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        /** Reused output value; avoids allocating one object per key. */
        private final IntWritable result = new IntWritable();

        /**
         * Adds up every value for {@code key} and writes the total.
         *
         * @param key     the word
         * @param values  partial counts for the word
         * @param context sink for the {@code (word, total)} pair
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            result.set(count);
            context.write(key, result);
        }
    }

    /** Emits {@code (lexeme, 1)} for every lexeme the IK segmenter finds in a record. */
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        /** Constant count emitted for each lexeme occurrence. */
        private static final IntWritable ONE = new IntWritable(1);
        /** Reused output key; avoids allocating one object per lexeme. */
        private final Text word = new Text();

        /**
         * Segments one input record and writes each lexeme with a count of 1.
         *
         * @param key     input key (unused)
         * @param value   the record text to segment
         * @param context sink for the {@code (lexeme, 1)} pairs
         * @throws IOException          if segmentation or writing fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Text.getBytes() exposes the whole backing buffer, which is only
            // valid up to getLength() and may hold bytes of an earlier, longer
            // record. toString() decodes exactly the valid bytes as UTF-8.
            StringReader reader = new StringReader(value.toString());

            // NOTE(review): the boolean flag presumably selects IK's
            // max-word-length segmentation mode - confirm against the IK 3.2.8 API.
            IKSegmentation ikSegmentation = new IKSegmentation(reader, true);

            Lexeme lexeme;
            while ((lexeme = ikSegmentation.next()) != null) {
                word.set(lexeme.getLexemeText());
                context.write(word, ONE);
            }
        }
    }
}