文本相似度去重,在大量的文本中找到相似的文本进行去重。
整体去重流程采用三层去重处理,分别是 url、title、content。url 和 title 都采用 MD5 去重(实现简单),content 采用 simhash + 海明距离去重(即文本相似度的计算)。
1、对content 计算 simhash 得到64位的字符串
1) 全角转半角
2) 繁体转换简体
3) 去除各类标签和特殊字符
4) 文本分词处理(采用的hanlp分词,其他自行查找)
5) 词频计算
6) 去除停用词
7) 计算词语权重
8) 根据词语和权重值,生成由0和1组成的64位simhash
2、查找计算汉明距离,判断文本是否重复,判断标准两个文本海明距离(海明距离可以理解为两个simhash有多少位不一样)小于等于3就认为是重复文本
1)将64位切成4段,每段16位,作为key,然后在redis查找 。
a) key 存在:取出该 key 下的所有值并逐一计算海明距离,只要有一个距离小于等于3,就认为是重复文本;若遍历完毕都没有距离小于等于3的,说明不是重复文本,转入步骤 2),将值存入 redis。
b) key 不存在:说明不是重复文本,转入步骤 2),将值存入 redis。
如果两个相似文本,海明距离最大为3,最多有3位不一样,最多分布在3个段内,4个段至少有1个段是完全一致的。(可以根据自己的需求设计海明距离,和分段数,目的主要是对大量数据进行过滤)
2)将16位的4段分别作为key 存到redis 中,可以采用 list 类型(16位的段作为key,simhash 作为list中的元素),也可以采用zset(本文中采用zset,16位的段作为key,simhash 作为zset中的value,然后再设计一个score 用于过滤)。
理论说完,开始上货。
maven 引入hanlp
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.7.5</version>
</dependency>
上工具类 可以直接拿走
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Pattern;
public class SimHashUtil {
private static Logger log = LoggerFactory.getLogger(SimHashUtil.class);
private static Set<String> stopWordsSet = new HashSet<>();
private static Map<String, Double> idfMap = new HashMap<String, Double>();
private static double idfAverage = loadIdfDict(idfMap);
static Pattern pattern = Pattern.compile("([0-9]*)|([0-9]*[年|月|日])");
/**
* 判断word是否无效,
*
* @param word
* @return true:无效
*/
private static boolean isInvalid(String word) {
if (word.length() < 2) {
return true;
}
return pattern.matcher(word).matches();
}
/**
* 获取 64位的 simHash 值
*
* @param textContent
* @return
*/
public static String getSimHashStr(String textContent) {
textContent = preprocessData(textContent);
//利用hanlp进行分词
List<Term> segment = HanLP.segment(textContent);
Map<String, Integer> wordMap = new HashMap<>();
Map<String, Double> tfidfMap = new HashMap<>();
//加载停用词
if (CollectionUtils.isEmpty(stopWordsSet)) {
loadStopWords();
}
if (CollectionUtils.isNotEmpty(segment)) {
//去除停用词
for (Term term : segment) {
String word = term.word.replace(" ", "");
boolean contains = stopWordsSet.contains(term.word);
if (contains || isInvalid(word)) {
continue;
}
if (wordMap.containsKey(word)) {
wordMap.put(word, wordMap.get(word) + 1);
} else {
wordMap.put(word, 1);
}
}
}
//计算词语权重
wordMap.forEach((k, v) -> {
if (idfMap.containsKey(k)) {
double idf = v * idfMap.get(k);
tfidfMap.put(k, idf);
} else {
double idf = v * idfAverage;
tfidfMap.put(k, idf);
}
});
return analysisSimHash(tfidfMap);
}
/**
* 预处理数据
*
* @param textContent
* @return
*/
public static String preprocessData(String textContent) {
if (StringUtils.isBlank(textContent)) {
return textContent;
}
//全角转半角
textContent = CharUtil.ToDBC(textContent);
//繁体转换简体
textContent = HanLP.convertToSimplifiedChinese(textContent);
//去除各类标签和特殊字符
textContent = removeTag(textContent);
return textContent;
}
/**
* 去除标签 和特殊字符
*
* @param text
* @return
*/
private static String removeTag(String text) {
if (null == text || text.isEmpty()) {
return "";
}
text = text.replaceAll("[`|+|•|/|<|>|《|》|_|"|·|。|“|”|「|\"|」|:|:|.|。|,|.|;|\\-|?|!|,|;|?|!|\t|\\[|\\]|(|)|{|}|【|】|(|)|||\\|、|]", "");
text = text.replaceAll("[#|…]", "").replaceAll("" >", "")
.replaceAll("\\s+", " ")
.replaceAll("[^\u4E00-\u9FA5]", "");//去除emoji图像;
text = text.replaceAll("\\s+", "").replaceAll(" ", "").replaceAll(" ", "");
text = text.replaceAll("\\s+", "").replaceAll(" +", "").replaceAll("\\u2003", "")
.replaceAll(" ", "").replaceAll("[\\s*|\t|\r|\n|\r\n|]", "").replaceAll(" ", "").replaceAll("nbsp", "");
text = text.replaceAll("[\u007f-\u009f]|\u00ad|[\u0483-\u0489]|[\u0559-\u055a]|\u058a|"
+ "[\u0591-\u05bd]|\u05bf|[\u05c1-\u05c2]|[\u05c4-\u05c7]|[\u0606-\u060a]|[\u063b-\u063f]|\u0674|"
+ "[\u06e5-\u06e6]|\u070f|[\u076e-\u077f]|\u0a51|\u0a75|\u0b44|[\u0b62-\u0b63]|[\u0c62-\u0c63]|"
+ "[\u0ce2-\u0ce3]|[\u0d62-\u0d63]|\u135f|[\u200b-\u200f]|[\u2028-\u202e]|\u2044|\u2071|[\uf701-\uf70e]|"
+ "[\uf710-\uf71a]|\ufb1e|[\ufc5e-\ufc62]|\ufeff|\ufffc", "");
text = text.replace("0", "").replace("1", "")
.replace("2", "").replace("3", "").replace("4", "")
.replace("5", "").replace("6", "").replace("7", "")
.replace("8", "").replace("9", "").toLowerCase().trim();
text = text.replace("0", "").replace("1", "")
.replace("2", "").replace("3", "").replace("4", "")
.replace("5", "").replace("6", "").replace("7", "")
.replace("8", "").replace("9", "").toLowerCase().trim();
return text;
}
/**
* 加载词频语料库 用于计算 词语权重
* 词频语料库 链接 https://download.csdn.net/download/qq_31286957/85926178
* @param idfMap
* @return
*/
private static double loadIdfDict(Map<String, Double> idfMap) {
InputStreamReader in;
long st1 = System.currentTimeMillis();
double idf = 0.0;
double idfSum = 0.0;
int lineno = 0;
String[] arrStrings = null;
String line = null;
try {
in = new InputStreamReader(new FileInputStream("data/idfold.utf8"), "UTF-8");//语料库可以自行去找中文词频语料库,内容是 词语和权重
BufferedReader bf = new BufferedReader(in);
while ((line = bf.readLine()) != null) {
if (line.isEmpty()) {
continue;
}
arrStrings = line.split(" ");
if (arrStrings.length != 2) {
continue;
}
idf = Double.valueOf(arrStrings[1]);
idfMap.put(arrStrings[0], idf);
idfSum += idf;
lineno++;
}
} catch (NumberFormatException e) {
e.printStackTrace();
log.error("数据格式错误:" + e.getMessage());
} catch (IOException e) {
e.printStackTrace();
log.error("IO错误:" + e.getMessage());
} catch (Exception e) {
e.printStackTrace();
log.error("读取不到加载idf语料词典: " + e.toString());
}
long st2 = System.currentTimeMillis();
return idfSum / lineno;
}
/**
* 加载停用词
* 停用词 链接 https://download.csdn.net/download/qq_31286957/85926178
* @param idfMap
* @return
*/
private static void loadStopWords() {
InputStreamReader in;
long st1 = System.currentTimeMillis();
String line = null;
try {
in = new InputStreamReader(new FileInputStream("data/stopword.dic"), "UTF-8");//停用词可以自己设置
BufferedReader bf = new BufferedReader(in);
while ((line = bf.readLine()) != null) {
if (line.isEmpty()) {
continue;
}
stopWordsSet.add(line);
}
} catch (NumberFormatException e) {
e.printStackTrace();
log.error("数据格式错误:" + e.getMessage());
} catch (IOException e) {
e.printStackTrace();
log.error("IO错误:" + e.getMessage());
} catch (Exception e) {
e.printStackTrace();
log.error("读取不到文件: " + e.toString());
}
long st2 = System.currentTimeMillis();
log.info("加载 stopword消耗时间: " + (st2 - st1) + "ms");
}
/**
* 根据所有词频和权重获取一个64位的hash值
*
* @param wordInfos
* @return
*/
private static String analysisSimHash(Map<String, Double> wordInfos) {
double[] featureVector = new double[FNVHash.HASH_BITS];
Set<String> words = wordInfos.keySet();
// System.out.println(words);
for (String word : words) {
BigInteger wordhash = FNVHash.fnv1aHash64(word);
//获取每一位的hash值是0还是1,使用与该位的1与的操作,节约时间
for (int i = 0; i < FNVHash.HASH_BITS; i++) {
BigInteger bitmask = BigInteger.ONE.shiftLeft(FNVHash.HASH_BITS - i - 1);
if (wordhash.and(bitmask).signum() != 0) {
featureVector[i] += wordInfos.get(word);
} else {
featureVector[i] -= wordInfos.get(word);
}
}
}
StringBuffer hashBuffer = new StringBuffer();
for (int i = 0; i < FNVHash.HASH_BITS; i++) {
if (featureVector[i] >= 0) {
hashBuffer.append("1");
} else {
hashBuffer.append("0");
}
}
return hashBuffer.toString();
}
/**
* 计算两个字符串的汉明距离
*
* @param a
* @param b
* @return
*/
public static int hammingDistance(String a, String b) {
if (a == null || b == null) {
return 0;
}
if (a.length() != b.length()) {
return 0;
}
int disCount = 0;
for (int i = 0; i < a.length(); i++) {
if (a.charAt(i) != b.charAt(i)) {
disCount++;
}
}
return disCount;
}
/**
* 获取 simhash 中 1 的个数
*
* @param simhash
* @return
*/
public static int getOneNumFromSimHash(String simhash) {
try {
if (simhash.length() != 64) {
return -1;
}
int score = 0;
for (int i = 0; i < simhash.length(); i++) {
if ((Integer.parseInt(String.valueOf(simhash.charAt(i)))) == 1) {
score++;
}
}
return score;
} catch (Exception e) {
return -1;
}
}
}
import java.math.BigInteger;
/**
 * 64-bit Fowler–Noll–Vo string hashing (FNV-1 and FNV-1a variants), used to
 * hash individual words when building a simhash fingerprint. Arithmetic is
 * done with BigInteger and truncated to 64 bits via {@link #MASK_64}.
 */
public final class FNVHash {
    /** Width of the produced hash in bits. */
    public static final int HASH_BITS = 64;
    /** FNV-64 offset basis (0xcbf29ce484222325). */
    public static final BigInteger FNV_64_INIT = new BigInteger("14695981039346656037");
    /** FNV-64 prime (0x100000001b3). */
    public static final BigInteger FNV_64_PRIME = new BigInteger("1099511628211");
    /** Mask selecting the low 64 bits of the accumulated product. */
    public static final BigInteger MASK_64 = BigInteger.ONE.shiftLeft(HASH_BITS).subtract(BigInteger.ONE);

    /**
     * FNV-1: for each UTF-16 code unit, multiply by the prime, then XOR in the unit.
     *
     * @param str input string (hashed per char, i.e. per UTF-16 unit)
     * @return 64-bit hash as a non-negative BigInteger
     */
    public static BigInteger fnv1Hash64(String str) {
        BigInteger acc = FNV_64_INIT;
        for (char ch : str.toCharArray()) {
            acc = acc.multiply(FNV_64_PRIME).xor(BigInteger.valueOf(ch));
        }
        return acc.and(MASK_64);
    }

    /**
     * FNV-1a: for each UTF-16 code unit, XOR in the unit first, then multiply by the prime.
     *
     * @param str input string (hashed per char, i.e. per UTF-16 unit)
     * @return 64-bit hash as a non-negative BigInteger
     */
    public static BigInteger fnv1aHash64(String str) {
        BigInteger acc = FNV_64_INIT;
        for (char ch : str.toCharArray()) {
            acc = acc.xor(BigInteger.valueOf(ch)).multiply(FNV_64_PRIME);
        }
        return acc.and(MASK_64);
    }
}
/**
 * Character-width conversion helpers: full-width ("SBC case") to half-width
 * ("DBC case") and back.
 *
 * FIX: the original paste was missing the class's closing brace, which made
 * the file unparseable; the class is now complete. Method names are kept
 * as-is (ToDBC/ToSBC) for caller compatibility.
 */
public class CharUtil {
    /**
     * Converts full-width characters to half-width.
     *
     * @param input text to convert
     * @return text with full-width space and full-width ASCII forms replaced
     *         by their half-width equivalents; other characters unchanged
     */
    public static String ToDBC(String input) {
        char[] c = input.toCharArray();
        for (int i = 0; i < c.length; i++) {
            if (c[i] == 12288) {
                // Full-width space (U+3000, 12288) maps to the ASCII space (32).
                c[i] = (char) 32;
                continue;
            }
            // Full-width forms U+FF01–U+FF5E sit exactly 65248 above ASCII 33–126.
            if (c[i] > 65280 && c[i] < 65375) {
                c[i] = (char) (c[i] - 65248);
            }
        }
        return new String(c);
    }

    /**
     * Converts half-width characters to full-width.
     *
     * @param input text to convert
     * @return text with the ASCII space and characters below 127 replaced by
     *         their full-width equivalents; other characters unchanged
     */
    public static String ToSBC(String input) {
        char[] c = input.toCharArray();
        for (int i = 0; i < c.length; i++) {
            if (c[i] == 32) {
                // ASCII space maps to the full-width space (U+3000, 12288).
                c[i] = (char) 12288;
                continue;
            }
            if (c[i] < 127) {
                c[i] = (char) (c[i] + 65248);
            }
        }
        return new String(c);
    }
}
调用使用示例
/**
 * Checks whether {@code content} is a near-duplicate of text seen in the last
 * seven days, using the 4-segment simhash index in Redis described above.
 *
 * Index layout (per 16-bit segment): key = "SIMHASH_" + segment bits; a zset
 * whose members are full 64-bit simhash strings and whose score encodes BOTH
 * the insert time and the hash's popcount: (timestamp with its last three
 * decimal digits zeroed) + (count of '1' bits + 100). The "+100" keeps the
 * low three digits positive, and score % 1000 recovers the popcount later.
 *
 * NOTE(review): assumes {@code redisTemplate} is a Spring Data
 * RedisTemplate injected elsewhere in this class — confirm its serializer
 * stores values as plain strings.
 *
 * @param content text to test and (if new) register in the index
 * @return true when a stored simhash within hamming distance 3 exists; false otherwise
 */
private boolean isDataRepeat(String content) {
    boolean startStatus = false;
    String simHashStr = SimHashUtil.getSimHashStr(content);
    if (StringUtils.isNotBlank(simHashStr) && simHashStr.length() == 64) {
        String strKey = "";
        String strKeyPrefix = "SIMHASH_";
        for (int i = 0; i < 4; i++) {
            // One key per 16-bit segment of the 64-bit simhash.
            strKey = strKeyPrefix + simHashStr.substring(i * 16, (i + 1) * 16);
            // Zero the last three decimal digits so they can carry the popcount.
            long currentTimeMillis = System.currentTimeMillis() / 1000 * 1000;
            Boolean hasKey = redisTemplate.hasKey(strKey);
            // Number of '1' bits in the simhash.
            int scoreBySimHash = SimHashUtil.getOneNumFromSimHash(simHashStr);
            // +100 keeps (popcount - maxDistance) from going negative when popcount < 3.
            long base = scoreBySimHash + 100L;
            int maxDistance = 3;
            // Only consider entries from the last 7 days.
            long begin = currentTimeMillis - 7 * 24 * 60 * 60 * 1000L;
            long end = base + maxDistance + currentTimeMillis;
            ZSetOperations zSetOperations = redisTemplate.opsForZSet();
            // Evict entries older than 7 days before scanning.
            zSetOperations.removeRangeByScore(strKey,0,begin);
            if (hasKey) {
                Set<DefaultTypedTuple> values = zSetOperations.rangeByScoreWithScores(strKey, begin, end);
                for (DefaultTypedTuple tuple : values) {
                    String str = tuple.getValue().toString();
                    long score = tuple.getScore().longValue();
                    // score % 1000 recovers the stored (popcount + 100).
                    long remainder = score % 1000;
                    // Pre-filter: hamming distance <= 3 implies the popcounts differ
                    // by at most 3, so skip the full comparison otherwise.
                    if (remainder >= (base - maxDistance) && remainder <= (base + maxDistance)) {
                        int hammingDistance = SimHashUtil.hammingDistance(str, simHashStr);
                        if (hammingDistance <= maxDistance) {
                            return true;
                        }
                    }
                }
            }
            // Not a duplicate in this segment: register the simhash.
            // Score = truncated timestamp + (popcount + 100), see method Javadoc.
            zSetOperations.add(strKey, simHashStr, currentTimeMillis + base);
            redisTemplate.expire(strKey, 7, TimeUnit.DAYS);
        }
    }
    return startStatus;
}