一、算法原理
基于字的文本相似度 Jaccard 算法的原理是:
(1)计算两个文本中字的交集
(2)计算两个文本中字的并集
(3)交集内的字的个数除以并集内的字的个数即为文本相似度值
(4)根据设置的阈值判断是否相似
二、算法的C++实现
这里引用的StringUtil.hpp文件引自:
https://github.com/yanyiwu/cppjieba/blob/master/deps/limonp/StringUtil.hpp
/*
* JaccardSimilarity.hpp
*
* Created: 2016年10月2日
* Author: tang
*/
#ifndef SRC_JACCARD_SIMILARITY_HPP_
#define SRC_JACCARD_SIMILARITY_HPP_
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <set>
#include <string>
#include <vector>
#include "StringUtil.hpp"
using namespace std;
/**
 * Character-level Jaccard similarity between two UTF-8 encoded texts.
 *
 * Both texts are decoded to 16-bit code units, every unit rejected by
 * codeFilter() is discarded, and the score is
 * |intersection| / |union| of the remaining units.
 *
 * Duplicates are NOT removed before the set operations, so repeated
 * characters are counted with multiset semantics: a character occurring
 * m times in one text and n times in the other contributes min(m, n) to
 * the intersection and max(m, n) to the union.
 */
class JaccardSimilarity
{
public:
    JaccardSimilarity()
    {
    }

    /**
     * Computes the similarity score of two UTF-8 strings.
     *
     * @param str1 first text (not modified)
     * @param str2 second text (not modified)
     * @return a value in [0, 1];
     *         0 when either string fails to decode ("TransCode Error" is
     *         written to standard output in that case);
     *         1 when neither string contains any accepted character.
     */
    double CalculateTextSimilarity(const std::string &str1, const std::string &str2) const
    {
        std::vector<uint16_t> words_for_str1;
        std::vector<uint16_t> words_for_str2;
        if (!utf8ToUnicode< std::vector<uint16_t> >(str1, words_for_str1) ||
            !utf8ToUnicode< std::vector<uint16_t> >(str2, words_for_str2))
        {
            std::cout << "TransCode Error" << std::endl;
            return 0.;
        }

        dropRejectedCodes(words_for_str1);
        dropRejectedCodes(words_for_str2);

        // Nothing left to compare on either side: treated as identical.
        // This also guarantees a non-empty union (no division by zero) below.
        if (words_for_str1.empty() && words_for_str2.empty())
            return 1.;

        // std::set_intersection / std::set_union require sorted input.
        std::sort(words_for_str1.begin(), words_for_str1.end());
        std::sort(words_for_str2.begin(), words_for_str2.end());

        std::vector<uint16_t> words_intersection;
        std::vector<uint16_t> words_union;
        words_union.reserve(words_for_str1.size() + words_for_str2.size());

        std::set_intersection(words_for_str1.begin(), words_for_str1.end(),
                              words_for_str2.begin(), words_for_str2.end(),
                              std::back_inserter(words_intersection));
        std::set_union(words_for_str1.begin(), words_for_str1.end(),
                       words_for_str2.begin(), words_for_str2.end(),
                       std::back_inserter(words_union));

        return static_cast<double>(words_intersection.size()) /
               static_cast<double>(words_union.size());
    }

    /**
     * Tells whether a code unit takes part in the comparison.
     *
     * @param code code unit value
     * @return true for values in 0x4e00..0x9fa5 (CJK Unified Ideographs),
     *         ASCII digits and ASCII letters; false for everything else
     *         (punctuation, whitespace, other scripts, ...).
     */
    static bool codeFilter(int code)
    {
        const bool is_cjk = code >= 0x4e00 && code <= 0x9fa5;
        const bool is_digit = code >= '0' && code <= '9';
        const bool is_lower = code >= 'a' && code <= 'z';
        const bool is_upper = code >= 'A' && code <= 'Z';
        return is_cjk || is_digit || is_lower || is_upper;
    }

private:
    /** Predicate for std::remove_if: true when codeFilter() rejects the unit. */
    static bool isRejectedCode(uint16_t code)
    {
        return !codeFilter(code);
    }

    /** Removes, in one linear pass, every unit that codeFilter() rejects. */
    static void dropRejectedCodes(std::vector<uint16_t> &codes)
    {
        codes.erase(std::remove_if(codes.begin(), codes.end(),
                                   &JaccardSimilarity::isRejectedCode),
                    codes.end());
    }
};
#endif /* SRC_JACCARD_SIMILARITY_HPP_ */
三、算法的 Java 实现
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
 * Character-level Jaccard similarity between two strings.
 *
 * Each accepted character is tagged with its running occurrence number
 * ("a1", "a2", ...), so repeated characters are compared with multiset
 * semantics while still using plain sets.
 */
public class JaccardSimilarity{
    public JaccardSimilarity() {
    }

    /**
     * Tells whether a character code takes part in the comparison.
     *
     * @param code character code
     * @return true for codes in 0x4E00..0x9FA5 (19968..40869, CJK Unified
     *         Ideographs), ASCII digits and ASCII letters; false otherwise
     */
    public boolean codeFilter(int code) {
        boolean isCjk = code >= 0x4E00 && code <= 0x9FA5;
        boolean isDigit = code >= '0' && code <= '9';
        boolean isLower = code >= 'a' && code <= 'z';
        boolean isUpper = code >= 'A' && code <= 'Z';
        return isCjk || isDigit || isLower || isUpper;
    }

    /**
     * Computes the similarity score of two texts.
     *
     * @param content        first text
     * @param compareContent second text
     * @return |intersection| / |union| of the occurrence tags of both texts,
     *         a value in [0, 1]; 0 when either argument is null or when
     *         neither text contains any accepted character
     */
    public double CalculateTextSim(String content, String compareContent) {
        if (null == content || null == compareContent)
            return 0.0;

        Set<String> cntSet = occurrenceTags(content);
        Set<String> cmpCntSet = occurrenceTags(compareContent);

        Set<String> unionSet = new HashSet<String>(cntSet);
        unionSet.addAll(cmpCntSet);
        if (unionSet.isEmpty())
            return 0;

        // cntSet is reduced in place to the intersection.
        cntSet.retainAll(cmpCntSet);

        double intCount = cntSet.size();
        double uniCount = unionSet.size();
        return intCount / uniCount;
    }

    /**
     * Builds the set of occurrence tags of a text: the k-th occurrence of an
     * accepted character c yields the tag c followed by k.
     */
    private Set<String> occurrenceTags(String text) {
        Map<Character, Integer> seen = new HashMap<Character, Integer>();
        Set<String> tags = new HashSet<String>();
        for (int i = 0; i < text.length(); i++) {
            // Surrogate halves lie outside every accepted range, so testing
            // the UTF-16 unit rejects supplementary characters as well.
            char ch = text.charAt(i);
            if (!codeFilter(ch))
                continue;
            Integer previous = seen.get(ch);
            int count = (previous == null) ? 1 : previous + 1;
            seen.put(ch, count);
            tags.add(ch + "" + count);
        }
        return tags;
    }
}