文本相似度去重,在大量的文本中找到相似的文本进行去重。
整体去重流程采用三层去重处理,分别是 url、title、content。url 和 title 都采用 MD5 去重(实现简单),content 采用 simhash + 海明距离去重(即文本相似度的计算)。
1、对content 计算 simhash 得到64位的字符串
1) 全角转半角
2) 繁体转换简体
3) 去除各类标签和特殊字符
4) 文本分词处理(采用的hanlp分词,其他自行查找)
5) 词频计算
6) 去除停用词
7) 计算词语权重
8) 根据词语和权重值,生成由0和1组成的64位simhash
2、查找计算汉明距离,判断文本是否重复,判断标准两个文本海明距离(海明距离可以理解为两个simhash有多少位不一样)小于等于3就认为是重复文本
1)将64位切成4段,每段16位,作为key,然后在redis查找 。
a) key 存在:取出该 key 下的所有值并逐一计算海明距离,只要有一个距离小于等于3,就认为是重复文本;若遍历完毕都没有距离小于等于3的,说明不是重复文本,转入步骤 2),将值存入 redis。
b) key 不存在:说明不是重复文本,转入步骤 2),将值存入 redis。
如果两个相似文本,海明距离最大为3,最多有3位不一样,最多分布在3个段内,4个段至少有1个段是完全一致的。(可以根据自己的需求设计海明距离,和分段数,目的主要是对大量数据进行过滤)
2)将16位的4段分别作为key 存到redis 中,可以采用 list 类型(16位的段作为key,simhash 作为list中的元素),也可以采用zset(本文中采用zset,16位的段作为key,simhash 作为zset中的value,然后再设计一个score 用于过滤)。
理论说完,开始上货。
maven 引入hanlp
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.7.5</version>
</dependency>
上工具类 可以直接拿走
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Pattern;
public class SimHashUtil {
private static Logger log = LoggerFactory.getLogger(SimHashUtil.class);
private static Set<String> stopWordsSet = new HashSet<>();
private static Map<String, Double> idfMap = new HashMap<String, Double>();
private static double idfAverage = loadIdfDict(idfMap);
static Pattern pattern = Pattern.compile("([0-9]*)|([0-9]*[年|月|日])");
/**
* 判断word是否无效,
*
* @param word
* @return true:无效
*/
private static boolean isInvalid(String word) {
if (word.length() < 2) {
return true;
}
return pattern.matcher(word).matches();
}
/**
* 获取 64位的 simHash 值
*
* @param textContent
* @return
*/
public static String getSimHashStr(String textContent) {
textContent = preprocessData(textContent);
//利用hanlp进行分词
List<Term> segment = HanLP.segment(textContent);
Map<String, Integer> wordMap = new HashMap<>();
Map<String, Double> tfidfMap = new HashMap<>();
//加载停用词
if (CollectionUtils.isEmpty(stopWordsSet)) {
loadStopWords();
}
if (CollectionUtils.isNotEmpty(segment)) {
//去除停用词
for (Term term : segment) {
String word = term.word.replace(" ", "");
boolean contains = stopWordsSet.contains(term.word);
if (contains || isInvalid(word)) {
continue;
}
if (wordMap.containsKey(word)) {
wordMap.put(word, wordMap.get(word) + 1);
} else {
wordMap.put(word, 1);
}
}
}
//计算词语权重
wordMap.forEach((k, v) -> {
if (idfMap.containsKey(k)) {
double idf = v * idfMap.get(k);
tfidfMap.put(k, idf);
} else {
double idf = v * idfAverage;
tfidfMap.put(k, idf);
}
});
return analysisSimHash(tfidfMap);
}
/**
* 预处理数据
*
* @param textContent
* @return
*/
public static String preprocessData(String textContent) {
if (StringUtils.isBlank(textContent)) {
return textContent;
}
//全角转半角
textContent = CharUtil.ToDBC(textContent);
//繁体转换简体
textContent = HanLP.convertToSimplifiedChinese(textContent);
//去除各类标签和特殊字符
textContent = removeTag(textContent);
return textContent;
}
/**
* 去除标签 和特殊字符
*
* @param text
* @return
*/
private static String removeTag(String text) {
if (null == text || text.isEmpty()) {
return "";
}
text = text.replaceAll("[`|+|•|/|<|>|《|》|_|"|·|。|“|”|「|\"|」|:|:|.|。|,|.|;|\\-|?|!|,|;|?|!|\t|\\[|\\]|(|)|{|}|【|】|(|)|||\\|、|]", "");
text = text.replaceAll("[#|…]", "").replaceAll("" >", "")
.replaceAll("\\s+", " ")
.replaceAll("[^\u4E00-\u9FA5]", "");//去除emoji图像;
text = text.replaceAll("\\s+", "").replaceAll(" ", "").replaceAll(" ", "");
text = text.replaceAll("\\s+", "").replaceAll(" +", "").replaceAll("\\u2003", "")
.replaceAll(" ", "").replaceAll("[\\s*|\t|\r|\n|\r\n|]", "").replaceAll(" ", "").replaceAll("nbsp", "");
text = text.replaceAll("[\u007f-\u009f]|\u00ad|[\u0483-\u0489]|[\u0559-\u055a]|\u058a|"
+ "[\u0591-\u05bd]|\u05bf|[\u05c1-\u05c2]|[\u05c4-\u05c7]|[\u0606-\u060a]|[\u063b-\u063f]|\u0674|"
+ "[\u06e5-\u06e6]|\u070f|[\u076e-\u077f]|\u0a51|\u0a75|\u0b44|[\u0b62-\u0b63]|[\u0c62-\u0c63]|"
+ "[\u0ce2-\u0ce3]|[\u0d62-\u0d63]|\u135f|[\u200b-\u200f]|[\u2028-\u202e]|\u2044|\u2071|[\uf701-\uf70e]|"
+ "[\uf710-\uf71a]|\ufb1e|[\ufc5e-\ufc62]|\ufeff|\ufffc", "");
text = text.replace("0", "").replace("1", "")
.replace("2", "").replace("3", "").replace("4", "")
.replace("5", "").replace("6", "").replace("7", "")
.replace("8", "").replace("9", "").toLowerCase().trim();
text = text.replace("0", "").replace("1", "")
.replace("2", "").replace("3", "").replace("4", "")
.replace("5", "").replace("6", "").replace("7", "")
.replace("8", "").replace("9", "").toLowerCase().trim();
return text;
}
/**
* 加载词频语料库 用于计算 词语权重
* 词频语料库 链接 https://download.csdn.net/download/qq_31286957/85926178
* @param idfMap
* @return
*/
private static double loadIdfDict(Map<String, Double> idfMap) {
InputStreamReader in;
long st1 = System.currentTimeMillis();
double idf = 0.0;
double idfSum = 0.0;
int lineno = 0;
String[] arrStrings = null;
String line = null;
try {
in = new InputStreamReader(new FileInputStream("data/idfold.utf8"), "UTF-8");//语料库可以自行去找中文词频语料库,内容是 词语和权重
BufferedReader bf = new BufferedReader(in);
while ((line = bf.readLine()) != null) {
if (line.isEmpty()) {
continue;
}
arrStrings = line.split(" ");
if (arrStrings.length != 2) {
continue;
}
idf = Double.valueOf(arrStrings[1]);
idfMap.put(arrStrings[0], idf);
idfSum += idf;
lineno++;
}
} catch (NumberFormatException e) {
e.printStackTrace();
log.error("数据格式错误:" + e.getMessage());
} catch (IOException e) {
e.printStackTrace();
log.error("IO错误:" + e.getMessage());
} catch (Exception e) {
e.printStackTrace();
log.error("读取不到加载idf语料词典: " + e.toString());
}
long st2 = System.currentTimeMillis();
return idfSum / lineno;
}
/**
* 加载停用词
* 停用词 链接 https://download.csdn.net/download/qq_31286957/85926178
* @param idfMap
* @return
*/
private static void loadStopWords() {
InputStreamReader in;
long st1 = System.currentTimeMillis();
String line = null;
try {
in = new InputStreamReader(new FileInputStream("data/stopword.dic"), "UTF-8");//停用词可以自己设置
BufferedReader bf = new BufferedReader(in);
while ((line = bf.readLine()) != null) {
if (line.isEmpty()) {
continue;
}
stopWordsSet.add(line);
}
} catch (NumberFormatException e) {
e.printStackTrace();
log.error("数据格式错误:" + e.getMessage());
} catch (IOException e) {
e.printStackTrace();
log.error("IO错误:" + e.getMessage());
} catch (Exception e) {
e.printStackTrace();
log.error("读取不到文件: " + e.toString());
}
long st2 = System.currentTimeMillis();
log.info("加载 stopword消耗时间: " + (st2 - st1) + "ms");
}
/**
* 根据所有词频和权重获取一个64位的hash值
*
* @param wordInfos
* @return
*/
private static String analysisSimHash(Map<String, Double> wordInfos) {
double[] featureVector = new double[FNVHash.HASH_BITS];
Set<String> words = wordInfos.keySet();
// System.out.println(words);
for (String word : words) {
BigInteger wordhash = FNVHash.fnv1aHash64(word);
//获取每一位的hash值是0还是1,使用与该位的1与的操作,节约时间
for (int i = 0; i < FNVHash.HASH_BITS; i++) {
BigInteger bitmask = BigInteger.ONE.shiftLeft(FNVHash.HASH_BITS - i - 1);
if (wordhash.and(bitmask).signum() != 0) {
featureVector[i] += wordInfos.get(word);
} else {
featureVector[i] -= wordInfos.get(word);
}
}
}
StringBuffer hashBuffer = new StringBuffer();
for (int i = 0; i < FNVHash.HASH_BITS; i++) {
if (featureVector[i] >= 0) {
hashBuffer.append("1");
} else {
hashBuffer.append("0");
}
}
return hashBuffer.toString();
}
/**
* 计算两个字符串的汉明距离
*
* @param a
* @param b
* @return
*/
public static int hammingDistance(String a, String b) {
if (a == null || b == null) {
return 0;
}
if (a.length() != b.length()) {
return 0;
}
int disCount = 0;
for (int i = 0; i < a.length(); i++) {
if (a.charAt(i) != b.charAt(i)) {
disCount++;
}
}
return disCount;
}
/**
* 获取 simhash 中 1 的个数
*
* @param simhash
* @return
*/
public static int getOneNumFromSimHash(String simhash) {
try {
if (simhash.length() != 64) {
return -1;
}
int score = 0;
for (int i = 0; i < simhash.length(); i++) {
if ((Integer.parseInt(String.valueOf(simhash.charAt(i)))) == 1) {
score++;
}
}
return score;
} catch (Exception e) {
return -1;
}
}
}
import java.math.BigInteger;
/**
 * 64-bit Fowler–Noll–Vo string hashing (FNV-1 and FNV-1a variants), used to
 * hash individual words when building a simhash fingerprint. Arithmetic is
 * done with BigInteger and truncated to 64 bits via {@link #MASK_64}.
 */
public final class FNVHash {
    /** Width of the produced hash in bits. */
    public static final int HASH_BITS = 64;
    /** FNV-64 offset basis (0xcbf29ce484222325). */
    public static final BigInteger FNV_64_INIT = new BigInteger("14695981039346656037");
    /** FNV-64 prime (0x100000001b3). */
    public static final BigInteger FNV_64_PRIME = new BigInteger("1099511628211");
    /** Mask selecting the low 64 bits of the accumulated product. */
    public static final BigInteger MASK_64 = BigInteger.ONE.shiftLeft(HASH_BITS).subtract(BigInteger.ONE);

    /**
     * FNV-1: for each UTF-16 code unit, multiply by the prime, then XOR in the unit.
     *
     * @param str input string (hashed per char, i.e. per UTF-16 unit)
     * @return 64-bit hash as a non-negative BigInteger
     */
    public static BigInteger fnv1Hash64(String str) {
        BigInteger acc = FNV_64_INIT;
        for (char ch : str.toCharArray()) {
            acc = acc.multiply(FNV_64_PRIME).xor(BigInteger.valueOf(ch));
        }
        return acc.and(MASK_64);
    }

    /**
     * FNV-1a: for each UTF-16 code unit, XOR in the unit first, then multiply by the prime.
     *
     * @param str input string (hashed per char, i.e. per UTF-16 unit)
     * @return 64-bit hash as a non-negative BigInteger
     */
    public static BigInteger fnv1aHash64(String str) {
        BigInteger acc = FNV_64_INIT;
        for (char ch : str.toCharArray()) {
            acc = acc.xor(BigInteger.valueOf(ch)).multiply(FNV_64_PRIME);
        }
        return acc.and(MASK_64);
    }
}
/**
 * Character-width conversion helpers: full-width ("SBC case") to half-width
 * ("DBC case") and back.
 *
 * FIX: the original paste was missing the class's closing brace, which made
 * the file unparseable; the class is now complete. Method names are kept
 * as-is (ToDBC/ToSBC) for caller compatibility.
 */
public class CharUtil {
    /**
     * Converts full-width characters to half-width.
     *
     * @param input text to convert
     * @return text with full-width space and full-width ASCII forms replaced
     *         by their half-width equivalents; other characters unchanged
     */
    public static String ToDBC(String input) {
        char[] c = input.toCharArray();
        for (int i = 0; i < c.length; i++) {
            if (c[i] == 12288) {
                // Full-width space (U+3000, 12288) maps to the ASCII space (32).
                c[i] = (char) 32;
                continue;
            }
            // Full-width forms U+FF01–U+FF5E sit exactly 65248 above ASCII 33–126.
            if (c[i] > 65280 && c[i] < 65375) {
                c[i] = (char) (c[i] - 65248);
            }
        }
        return new String(c);
    }

    /**
     * Converts half-width characters to full-width.
     *
     * @param input text to convert
     * @return text with the ASCII space and characters below 127 replaced by
     *         their full-width equivalents; other characters unchanged
     */
    public static String ToSBC(String input) {
        char[] c = input.toCharArray();
        for (int i = 0; i < c.length; i++) {
            if (c[i] == 32) {
                // ASCII space maps to the full-width space (U+3000, 12288).
                c[i] = (char) 12288;
                continue;
            }
            if (c[i] < 127) {
                c[i] = (char) (c[i] + 65248);
            }
        }
        return new String(c);
    }
}
调用使用示例
/**
 * Checks whether {@code content} is a near-duplicate of text seen in the last
 * seven days, using the 4-segment simhash index in Redis described above.
 *
 * Index layout (per 16-bit segment): key = "SIMHASH_" + segment bits; a zset
 * whose members are full 64-bit simhash strings and whose score encodes BOTH
 * the insert time and the hash's popcount: (timestamp with its last three
 * decimal digits zeroed) + (count of '1' bits + 100). The "+100" keeps the
 * low three digits positive, and score % 1000 recovers the popcount later.
 *
 * NOTE(review): assumes {@code redisTemplate} is a Spring Data
 * RedisTemplate injected elsewhere in this class — confirm its serializer
 * stores values as plain strings.
 *
 * @param content text to test and (if new) register in the index
 * @return true when a stored simhash within hamming distance 3 exists; false otherwise
 */
private boolean isDataRepeat(String content) {
    boolean startStatus = false;
    String simHashStr = SimHashUtil.getSimHashStr(content);
    if (StringUtils.isNotBlank(simHashStr) && simHashStr.length() == 64) {
        String strKey = "";
        String strKeyPrefix = "SIMHASH_";
        for (int i = 0; i < 4; i++) {
            // One key per 16-bit segment of the 64-bit simhash.
            strKey = strKeyPrefix + simHashStr.substring(i * 16, (i + 1) * 16);
            // Zero the last three decimal digits so they can carry the popcount.
            long currentTimeMillis = System.currentTimeMillis() / 1000 * 1000;
            Boolean hasKey = redisTemplate.hasKey(strKey);
            // Number of '1' bits in the simhash.
            int scoreBySimHash = SimHashUtil.getOneNumFromSimHash(simHashStr);
            // +100 keeps (popcount - maxDistance) from going negative when popcount < 3.
            long base = scoreBySimHash + 100L;
            int maxDistance = 3;
            // Only consider entries from the last 7 days.
            long begin = currentTimeMillis - 7 * 24 * 60 * 60 * 1000L;
            long end = base + maxDistance + currentTimeMillis;
            ZSetOperations zSetOperations = redisTemplate.opsForZSet();
            // Evict entries older than 7 days before scanning.
            zSetOperations.removeRangeByScore(strKey,0,begin);
            if (hasKey) {
                Set<DefaultTypedTuple> values = zSetOperations.rangeByScoreWithScores(strKey, begin, end);
                for (DefaultTypedTuple tuple : values) {
                    String str = tuple.getValue().toString();
                    long score = tuple.getScore().longValue();
                    // score % 1000 recovers the stored (popcount + 100).
                    long remainder = score % 1000;
                    // Pre-filter: hamming distance <= 3 implies the popcounts differ
                    // by at most 3, so skip the full comparison otherwise.
                    if (remainder >= (base - maxDistance) && remainder <= (base + maxDistance)) {
                        int hammingDistance = SimHashUtil.hammingDistance(str, simHashStr);
                        if (hammingDistance <= maxDistance) {
                            return true;
                        }
                    }
                }
            }
            // Not a duplicate in this segment: register the simhash.
            // Score = truncated timestamp + (popcount + 100), see method Javadoc.
            zSetOperations.add(strKey, simHashStr, currentTimeMillis + base);
            redisTemplate.expire(strKey, 7, TimeUnit.DAYS);
        }
    }
    return startStatus;
}