1. Manually install ik-analyzer into the local Maven repository. This is necessary because the version available from the Maven repository is too new (it currently provides 5.1.0); on top of that, I compiled with JDK 1.8.0_111 while the runtime JDK was 1.7.0_79, which causes a version-compatibility problem.
Solution 1: use the same JDK version for compiling and running (I did not try this; a related sketch follows the pom.xml below).
Solution 2: manually install an older version of ik-analyzer (this is the approach I used).
Step 1: download ik-analyzer
https://github.com/wks/ik-analyzer (the pom.xml on that page shows that the IK version is 3.2.8)
Step 2: unzip the downloaded ik-analyzer-master.zip and, in CMD (on Windows), change into the directory that contains pom.xml.
Run the command: mvn package -Dmaven.test.skip=true
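Note that mvn package only builds the jar under target/. Since the point of this step is to get ik-analyzer into the local Maven repository so the dependency in the next step can resolve, running the install phase instead should achieve that; the command below is my suggestion rather than part of the original write-up, assuming the coordinates org.wltea.ik-analyzer:ik-analyzer:3.2.8 shown in the pom.xml further down:
Run the command: mvn install -Dmaven.test.skip=true
Alternatively, an already-built jar can be installed directly with mvn install:install-file -Dfile=target/<the built jar> -DgroupId=org.wltea.ik-analyzer -DartifactId=ik-analyzer -Dversion=3.2.8 -Dpackaging=jar (the exact jar name under target/ depends on that project's pom).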
2. Create the Maven project
The pom.xml file:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.wsz.hadoop.wordcount</groupId>
    <artifactId>MyHadoopMRIK</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.6.1</version>
        </dependency>
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.2</version>
        </dependency>
        <dependency>
            <groupId>org.wltea.ik-analyzer</groupId>
            <artifactId>ik-analyzer</artifactId>
            <version>3.2.8</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.wsz.hadoop.wordcount.MyHadoopMRIK</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
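As an aside on solution 1 above (matching the compile and runtime JDKs, which I did not try): an alternative sketch, purely my assumption and not something the original setup used, is to keep compiling with JDK 1.8 but have Maven emit 1.7-compatible bytecode by adding the maven-compiler-plugin to the <plugins> section above:
<plugin>
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-compiler-plugin</artifactId>
    <configuration>
        <!-- assumption: target the 1.7 language/bytecode level so the job jar runs on the JDK 1.7.0_79 cluster -->
        <source>1.7</source>
        <target>1.7</target>
    </configuration>
</plugin>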
The WordCount class:
package com.wsz.hadoop.wordcount;
import java.io.IOException;
import java.io.StringReader;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;
public class MyHadoopMRIK {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        // At least an input path and an output path are required, i.e. otherArgs.length >= 2
        if (otherArgs.length < 2) {
            System.err.println("Usage: wordcount <in> [<in>...] <out>");
            System.exit(1);
        }
        Job job = new Job(conf, "MyHadoopMRIK");
        job.setJarByClass(MyHadoopMRIK.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // All arguments except the last are input paths; the last one is the output path
        for (int i = 0; i < otherArgs.length - 1; ++i)
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        /*
         * private IntWritable result;
         *
         * public IntSumReducer() { this.result = new IntWritable(); }
         */
        public void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // 1. Combine: aggregate the counts emitted for this key by all the maps into this reduce
            Iterator<IntWritable> iterator = values.iterator();
            int count = 0;
            // iterate over the values and sum them up
            while (iterator.hasNext()) {
                IntWritable value = iterator.next();
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        /*
         * private static final IntWritable one = new IntWritable(1);
         * private Text word;
         *
         * public TokenizerMapper() { this.word = new Text(); }
         */
        // <Object, Text, Text, IntWritable>: input key type, input value type, output key type, output value type
        public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // 1. Get the content to split
            // StringTokenizer tokens = new StringTokenizer(value.toString());
            /*
             * String keyValue = new String(value.getBytes());
             *
             * // 2. Split into words
             * String[] values = keyValue.split(" ");
             * for (String v : values) {
             *     // write each split word to the map output
             *     context.write(new Text(v), new IntWritable(1));
             * }
             */
            // 1. Get the text to segment; value.toString() decodes only the valid bytes,
            //    whereas value.getBytes() may contain stale bytes beyond getLength()
            String str = value.toString();
            StringReader reader = new StringReader(str);
            // 2. Create the IK Chinese word segmentation object
            IKSegmentation ikSegmentation = new IKSegmentation(reader, true);
            Lexeme lexeme = null;
            // 3. Emit <term, 1> for every lexeme the segmenter produces
            while ((lexeme = ikSegmentation.next()) != null) {
                context.write(new Text(lexeme.getLexemeText()), new IntWritable(1));
            }
        }
    }
}
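To package and submit the job (this final step is not in the original notes, so the paths are only placeholders): run mvn package in the project root; because of the maven-assembly-plugin configuration, this produces target/MyHadoopMRIK-1.0-SNAPSHOT-jar-with-dependencies.jar with com.wsz.hadoop.wordcount.MyHadoopMRIK as its manifest main class. The job can then be submitted with, for example:
Run the command: hadoop jar target/MyHadoopMRIK-1.0-SNAPSHOT-jar-with-dependencies.jar /user/hadoop/ik/input /user/hadoop/ik/output
where the two arguments are example HDFS input and output paths; the output directory must not already exist when the job starts.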