一 相关 pom 包
<!-- 工具包 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.5</version>
</dependency>
<!-- bean实体注解工具包 -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
<!-- 汉语言包,主要用于分词 -->
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.6.5</version>
</dependency>
二 相关类
1 分词工具类
package similarity;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Chinese word-segmentation utility backed by HanLP.
 */
public class Tokenizer {
    /**
     * Segments a sentence with HanLP and wraps every resulting term in a {@link Word}.
     *
     * @param sentence the text to segment; may be {@code null} or empty
     * @return one {@code Word} per HanLP term, in the order HanLP produced them;
     *         an empty, mutable list when {@code sentence} is {@code null} or empty
     */
    public static List<Word> segment(String sentence) {
        // Nothing to segment. NOTE(review): HanLP.segment presumably dereferences its
        // argument and would throw on null -- confirm against the HanLP version in use.
        if (sentence == null || sentence.isEmpty()) {
            return new ArrayList<>();
        }
        // 1. Segment the sentence with HanLP's segmenter.
        List<Term> termList = HanLP.segment(sentence);
        // 2. Re-wrap each term as a Word: term.word is the token text and term.nature is
        //    its part of speech. nature is a plain public field, so guard against it being
        //    unset instead of calling toString() on null; Word accepts a null pos.
        return termList.stream()
                .map(term -> new Word(term.word, term.nature == null ? null : term.nature.toString()))
                .collect(Collectors.toList());
    }
}
2 封装分词结果
package similarity;
import lombok.Data;
import java.util.Objects;
/**
* 封装分词结果 */
@Data
public class Word implements Comparable {
// 词名
private String name;
// 词性
private String pos;
// 权重,用于词向量分析
private Float weight;
/**
 * Creates a word from its text and part-of-speech tag.
 * The weight field is not assigned here.
 *
 * @param name the word text
 * @param pos  the part-of-speech tag
 */
public Word(String name, String pos) {
    this.pos = pos;
    this.name = name;
}
/**
 * Hashes on the word text only, matching {@code equals}.
 *
 * @return the hash code of {@code name}, or 0 when {@code name} is null
 */
@Override
public int hashCode() {
    return name == null ? 0 : name.hashCode();
}
/**
 * Two words are equal when they are of the same class and have the same text;
 * part of speech and weight are not compared.
 *
 * @param obj the object to compare with
 * @return {@code true} if {@code obj} is a {@code Word} of the same class with an equal name
 */
@Override
public boolean equals(Object obj) {
    if (obj == null || obj.getClass() != getClass()) {
        return false;
    }
    Word that = (Word) obj;
    return Objects.equals(name, that.name);
}
/**
 * Renders the word as {@code name/pos}; a null name or a null pos
 * (together with its leading slash) is left out.
 *
 * @return the textual form of this word
 */
@Override
public String toString() {
    String text = (name == null) ? "" : name;
    String tag = (pos == null) ? "" : "/" + pos;
    return text + tag;
}
@Override
public int compareTo(Object o) {
if (this == o) {
return 0;
}
if (this.name == null) {
return -1;