前言
该方法使用 HanLP 进行分词,并基于 SimHash 算法思想计算文本相似度。
参考大佬们的代码与原理:(部分)
simhash算法及原理简介
海量文本用 Simhash, 2小时变4秒! | 文本分析:大规模文本处理(2)
一、计算文本相似度工具类SimHashUtils
package com.siboo.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.math.BigInteger;
import java.sql.Clob;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;
import com.siboo.utils.StringUtils;
/**
*
* @Title: SimHashUtils.java
* @Package com.siboo.util
* @author 陈笑璞
* @date 2020年6月22日 下午5:35:32
* @Description: 计算文本相似度工具类(采用SimHash算法思想)
*
*/
public class SimHashUtils {
// Number of bits in a signature; 64 by default. The field is static, so the
// int-argument constructor changes it for the whole class.
private static int hashbits = 64;
// Matches a non-empty string made up only of characters in U+4E00..U+9FA5.
// The lower bound used to be U+4E80, which rejected every character in
// U+4E00..U+4E7F (for example U+4E2D) even though those are Chinese ideographs.
private static Pattern CHINES_PATTERIN = Pattern.compile("^[\u4e00-\u9fa5]+$" );
// Default weight.
private static final int DWEIGHT = 1;
public SimHashUtils() {
super();
}
/**
 * Creates an instance and sets the signature width.
 *
 * <p>Note: {@code hashbits} is a static field, so the value passed here
 * replaces the width for the whole class, not only for the new instance.
 *
 * @param hashbits number of bits in a signature
 */
public SimHashUtils(int hashbits) {
    // Written with the class name to make the class-wide effect explicit.
    SimHashUtils.hashbits = hashbits;
}
/**
 * Splits a SimHash signature string into four consecutive segments.
 *
 * <p>The first three segments each hold {@code signature.length() / 4}
 * characters. The fourth segment runs to the end of the string, so when the
 * length is not a multiple of four it also carries the remaining characters
 * instead of dropping them. For lengths divisible by four all segments have
 * the same size.
 *
 * @param signature the signature string to split
 * @return a list of exactly four segments, in their original order
 * @throws NullPointerException if {@code signature} is {@code null}
 */
public static List<String> splitFourEqual(String signature) {
    final int parts = 4;
    final int length = signature.length();
    final int size = length / parts;
    List<String> segments = new ArrayList<String>(parts);
    for (int i = 0; i < parts; i++) {
        int begin = i * size;
        // The last segment absorbs the length % 4 trailing characters so that
        // no part of the signature is silently lost.
        int end = (i == parts - 1) ? length : begin + size;
        segments.add(signature.substring(begin, end));
    }
    return segments;
}
/**
*
* @Title: ClobToString
* @Description: Java读取Oracle的CLOB字段转换为String数据
* @param clob CLOB字段类型数据
* @throws SQLException
* @throws IOException
* @return String 返回转换为String数据
*/
public static String ClobToString(Clob clob) throws SQLException, IOException {
Reader is = clob.getCharacterStream();// 得到流
BufferedReader br = new BufferedReader(is);
String s = br.readLine();
StringBuffer sb = new StringBuffer();
while (s != null) {
// 执行循环将字符串全部取出付值给StringBuffer由StringBuffer转成String
sb.append(s);
s = br.readLine();
}
if(br!=null){
br