基于字的文本相似度算法——Jaccard算法

最新推荐文章于 2023-12-17 17:10:08 发布

inrgihc

最新推荐文章于 2023-12-17 17:10:08 发布

阅读量3.6k

点赞数

分类专栏：综合文章标签：算法

本文链接：https://blog.csdn.net/inrgihc/article/details/52739959

版权

综合专栏收录该内容

18 篇文章

订阅专栏

本文介绍了一种基于字符级别的Jaccard文本相似度算法，并提供了C++和Java两种语言的具体实现。该算法通过计算两段文本中字符的交集与并集来确定它们之间的相似程度。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

一、算法原理

基于字的文本相似度Jaccard算法的原理是：
（1）计算两个文本中字的交集
（2）计算两个文本中字的并集
（3）交集内的字的个数除以并集内的字的个数即为文本相似度值
（4）根据设置的阈值判断是否相似

二、算法的C++实现

这里引用的StringUtil.hpp文件引自：

https://github.com/yanyiwu/cppjieba/blob/master/deps/limonp/StringUtil.hpp

/*
 * JaccardSimilarity.hpp
 *
 *  Created: 2016年10月2日
 *   Author: tang
 */

#ifndef SRC_JACCARD_SIMILARITY_HPP_
#define SRC_JACCARD_SIMILARITY_HPP_
#include <algorithm>
#include <iostream>
#include <iterator>
#include <set>
#include <string>
#include <vector>
#include "StringUtil.hpp"

using namespace std;

/**
 * Character-level Jaccard similarity between two texts.
 *
 * Each input is converted with utf8ToUnicode (declared in StringUtil.hpp,
 * not visible here -- presumably it decodes UTF-8 into 16-bit units), every
 * unit rejected by codeFilter() is dropped, and the two remaining sequences
 * are sorted and compared. Repeated characters are kept, so the
 * intersection and union are taken with the duplicate-aware semantics of
 * std::set_intersection / std::set_union on sorted ranges.
 */
class JaccardSimilarity
{
public:

	JaccardSimilarity()
	{
	}

	/**
	 * Computes |intersection| / |union| over the retained characters of the
	 * two texts.
	 *
	 * @param str1 first text
	 * @param str2 second text
	 * @return 0 (after printing "TransCode Error" to standard output) when
	 *         utf8ToUnicode reports failure for either text; 1 when neither
	 *         text has any retained character; otherwise the ratio described
	 *         above, in the range [0, 1].
	 */
	double CalculateTextSimilarity(std::string &str1,std::string &str2)
	{
		std::vector<uint16_t> words_for_str1;
		std::vector<uint16_t> words_for_str2;

		if(!utf8ToUnicode< std::vector<uint16_t> >(str1,words_for_str1) ||
			!utf8ToUnicode< std::vector<uint16_t> >(str2,words_for_str2 ) )
		{
			std::cout<<"TransCode Error"<<std::endl;
			return 0.;
		}

		dropFilteredUnits(words_for_str1);
		dropFilteredUnits(words_for_str2);

		// Nothing left on either side: treated as identical texts.
		if(words_for_str1.empty() && words_for_str2.empty())
			return 1.;

		// The set algorithms below require sorted input ranges.
		std::sort(words_for_str1.begin(),words_for_str1.end());
		std::sort(words_for_str2.begin(),words_for_str2.end());

		std::vector<uint16_t> words_intersection;
		std::vector<uint16_t> words_union;
		// Upper bounds on the result sizes; avoids reallocation while appending.
		words_intersection.reserve(std::min(words_for_str1.size(),words_for_str2.size()));
		words_union.reserve(words_for_str1.size()+words_for_str2.size());

		std::set_intersection(words_for_str1.begin(),words_for_str1.end(),
					words_for_str2.begin(),words_for_str2.end(),
					std::back_inserter(words_intersection));

		std::set_union(words_for_str1.begin(),words_for_str1.end(),
					words_for_str2.begin(),words_for_str2.end(),
					std::back_inserter(words_union));

		// The union is non-empty here because at least one input is non-empty.
		return static_cast<double>(words_intersection.size()) /
			static_cast<double>(words_union.size());
	}

	/**
	 * Tells whether a code value takes part in the comparison.
	 *
	 * @param code code value to test
	 * @return true for values in 0x4e00..0x9fa5 and for ASCII digits and
	 *         ASCII letters; false for everything else.
	 */
	bool codeFilter(int code)
	{
		if (code >= 0x4e00 && code <= 0x9fa5)
			return true;
		if (code >= '0' && code <= '9')
			return true;
		if (code >= 'a' && code <= 'z')
			return true;
		if (code >= 'A' && code <= 'Z')
			return true;

		return false;
	}

private:

	/**
	 * Removes, in place, every unit that codeFilter() rejects.
	 *
	 * Kept units are compacted towards the front in a single pass and the
	 * tail is cut off once at the end, instead of erasing element by
	 * element (which shifts the remainder on every removal).
	 */
	void dropFilteredUnits(std::vector<uint16_t> &units)
	{
		std::vector<uint16_t>::size_type kept = 0;
		for(std::vector<uint16_t>::size_type i = 0; i < units.size(); ++i)
		{
			if(codeFilter(units[i]))
			{
				units[kept] = units[i];
				++kept;
			}
		}
		units.resize(kept);
	}

};

#endif /* SRC_JACCARD_SIMILARITY_HPP_ */

三、算法的Java实现

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;


/**
 * Character-level Jaccard similarity between two strings.
 *
 * Only characters accepted by {@link #codeFilter(int)} take part. Repeated
 * characters are distinguished by their occurrence number (the first 'a'
 * becomes the token "a1", the second "a2", ...), so the set operations
 * below behave like multiset intersection and union.
 */
public class JaccardSimilarity{

    public JaccardSimilarity() {
    }

    /**
     * Tells whether a code point takes part in the comparison.
     *
     * @param code code point to test
     * @return true for values in 0x4e00..0x9fa5 (19968..40869) and for ASCII
     *         digits and ASCII letters; false for everything else
     */
    public boolean codeFilter(int code) {
        boolean inCjkRange = code >= 0x4e00 && code <= 0x9fa5;
        boolean isDigit = code >= '0' && code <= '9';
        boolean isLower = code >= 'a' && code <= 'z';
        boolean isUpper = code >= 'A' && code <= 'Z';
        return inCjkRange || isDigit || isLower || isUpper;
    }

    /**
     * Computes |intersection| / |union| over the occurrence-numbered
     * characters of the two strings.
     *
     * @param content        first text
     * @param compareContent second text
     * @return 0 when either argument is null or when neither text contains a
     *         retained character; otherwise a value in [0, 1]
     */
    public double CalculateTextSim(String content, String compareContent) {
        if (null == content || null == compareContent)
            return 0.0;

        Set<String> cntSet = countedChars(content);
        Set<String> cmpCntSet = countedChars(compareContent);

        Set<String> unionSet = new HashSet<String>(cntSet);
        unionSet.addAll(cmpCntSet);
        // Guards the division below: both texts had nothing to compare.
        if (unionSet.isEmpty())
            return 0;

        Set<String> commonSet = new HashSet<String>(cntSet);
        commonSet.retainAll(cmpCntSet);

        return (double) commonSet.size() / (double) unionSet.size();
    }

    /**
     * Turns a text into the set of its retained characters, each suffixed
     * with its running occurrence count within that text.
     */
    private Set<String> countedChars(String text) {
        Map<Character, Integer> seen = new HashMap<Character, Integer>();
        Set<String> tokens = new HashSet<String>();

        for (int i = 0; i < text.length(); i++) {
            // The filter is given the code point at i, while the token is
            // built from the UTF-16 unit at i, as in the original code.
            if (!codeFilter(text.codePointAt(i)))
                continue;

            Character ch = Character.valueOf(text.charAt(i));
            Integer previous = seen.get(ch);
            int occurrence = (previous == null) ? 1 : previous.intValue() + 1;
            seen.put(ch, Integer.valueOf(occurrence));

            tokens.add(ch.toString() + occurrence);
        }
        return tokens;
    }

}