使用余弦相似性原理计算文本的相似度

原理参考:http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html

很多人反映依赖包不对,或者不知道去哪里下载,这里贴上 IK Analyzer 的下载地址:https://code.google.com/p/ik-analyzer/downloads/list 。附件中已附上 IK 的 jar 包,其余 Apache Commons 的包(commons-collections、commons-lang3)请自行下载。

/**
*
*/
package com.text;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * Demo that scores how similar two texts are by computing the cosine
 * similarity of their term-frequency vectors. Terms are produced by the
 * IK Analyzer segmenter.
 *
 * @author Riching
 *
 * @date 2013-8-10
 */
public class IKMainTest {

    /**
     * Segments two hard-coded sample sentences, merges their term
     * frequencies into one paired vector and prints the resulting
     * similarity score to standard output.
     *
     * @param args unused
     * @throws IOException if the segmenter fails while reading a sentence
     */
    public static void main(String[] args) throws IOException {
        String str1 = "我喜欢看电视,不喜欢看电影。";
        String str2 = "我不喜欢看电视,也不喜欢看电影。";
        Map<String, Integer> tf1 = getTF(str1);
        Map<String, Integer> tf2 = getTF(str2);
        double d = caclIDF(mergeTF(tf1, tf2));
        System.out.println(d);
    }

    /**
     * Combines two term-frequency maps into one map whose value holds the
     * count in the first text (left) and the count in the second text
     * (right). A term missing from one text gets a count of 0 on that side.
     *
     * @param tf1 term frequencies of the first text
     * @param tf2 term frequencies of the second text
     * @return a new map covering every term that occurs in either input
     */
    private static Map<String, MutablePair<Integer, Integer>> mergeTF(Map<String, Integer> tf1,
            Map<String, Integer> tf2) {
        Map<String, MutablePair<Integer, Integer>> tfs = new HashMap<String, MutablePair<Integer, Integer>>();
        for (Map.Entry<String, Integer> entry : tf1.entrySet()) {
            tfs.put(entry.getKey(), new MutablePair<Integer, Integer>(entry.getValue(), 0));
        }
        for (Map.Entry<String, Integer> entry : tf2.entrySet()) {
            MutablePair<Integer, Integer> pair = tfs.get(entry.getKey());
            if (null == pair) {
                // A term that occurs only in the second text must be stored as well;
                // otherwise it is missing from the second vector and the score is inflated.
                tfs.put(entry.getKey(), new MutablePair<Integer, Integer>(0, entry.getValue()));
            } else {
                pair.setRight(entry.getValue());
            }
        }
        return tfs;
    }

    /**
     * Splits the text into terms with {@link IKSegmenter} and counts how
     * often each term occurs.
     *
     * @param str the text to segment
     * @return a map from term text to its number of occurrences; empty when
     *         the segmenter yields no terms
     * @throws IOException if the segmenter fails while reading the text
     */
    public static Map<String, Integer> getTF(String str) throws IOException {
        Map<String, Integer> map = new HashMap<String, Integer>();
        // NOTE(review): the second constructor argument is believed to select
        // IK's "smart" segmentation mode - confirm against the IK Analyzer docs.
        IKSegmenter ikSegmenter = new IKSegmenter(new StringReader(str), true);
        Lexeme lexeme = null;
        while ((lexeme = ikSegmenter.next()) != null) {
            String key = lexeme.getLexemeText();
            Integer count = map.get(key);
            map.put(key, null == count ? 1 : count + 1);
        }
        return map;
    }

    /**
     * Computes the cosine similarity of the two term-frequency vectors
     * stored in the map: the left values form the first vector and the right
     * values form the second. Despite its name, this method does not compute
     * an inverse document frequency; the name is kept for compatibility.
     *
     * @param tf map from term to the pair (count in first text, count in
     *           second text)
     * @return the dot product of the two vectors divided by the product of
     *         their Euclidean norms; 0 when the map is null or empty, or when
     *         either vector has zero length
     */
    public static double caclIDF(Map<String, MutablePair<Integer, Integer>> tf) {
        if (MapUtils.isEmpty(tf)) {
            return 0;
        }
        double dotProduct = 0;
        double sqdoc1 = 0;
        double sqdoc2 = 0;
        for (Pair<Integer, Integer> count : tf.values()) {
            // Widen to double before multiplying so large counts cannot overflow int.
            double left = count.getLeft();
            double right = count.getRight();
            dotProduct += left * right;
            sqdoc1 += left * left;
            sqdoc2 += right * right;
        }
        double norms = Math.sqrt(sqdoc1) * Math.sqrt(sqdoc2);
        if (norms == 0) {
            // One vector is all zeros (e.g. a text with no terms): avoid 0/0 = NaN.
            return 0;
        }
        return dotProduct / norms;
    }
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值