帮人写了一个简单的小程序,用于计算两个文本字符串的相似度。计算方式非常简单:先用Lucene提供的StandardAnalyzer对两个字符串分别分词并去除停用词,再用PorterStemFilter做词干归一化,然后统计第一个文本的词项中有多少个也出现在第二个文本里,相似度 = 重复词项个数 / 第一个文本词项总数(注意该度量是不对称的,交换两个文本的顺序,结果可能不同)。原理非常简单,类似词袋模型,但还是有些用处。
代码如下:
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
/**
 * Computes a simple bag-of-words overlap score between two strings.
 *
 * <p>Each input is tokenized by the configured {@link Analyzer} and the
 * resulting tokens are stemmed with a {@link PorterStemFilter}. The score is
 * the fraction of the first text's terms that also occur among the second
 * text's terms.
 *
 * @author zhangxichuan
 */
public class SimCalculator {

    /**
     * Field name handed to {@link Analyzer#tokenStream}. No index is involved
     * here, so a fixed placeholder is used instead of the text being analyzed.
     */
    private static final String FIELD_NAME = "content";

    /** Analyzer used to split the input text into terms. */
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    /**
     * Returns the share of terms in {@code content1} that also appear in
     * {@code content2}.
     *
     * <p>The measure is asymmetric: the denominator is the number of terms in
     * {@code content1} only. A term repeated in {@code content1} is counted
     * once per occurrence, while {@code content2} is treated as a set.
     *
     * @param content1 the text whose terms are looked up; may be {@code null}
     * @param content2 the text whose terms form the lookup set; may be {@code null}
     * @return a value between 0 and 1; 0 when either text is {@code null} or
     *         yields no terms
     */
    public double calculate(String content1, String content2) {
        List<String> terms1 = getTokenizedList(content1);
        List<String> terms2 = getTokenizedList(content2);
        if (isEmpty(terms1) || isEmpty(terms2)) {
            return 0d;
        }

        // The copy constructor sizes the set correctly for its contents.
        Set<String> vocabulary2 = new HashSet<String>(terms2);

        int shared = 0;
        for (String term : terms1) {
            if (vocabulary2.contains(term)) {
                shared++;
            }
        }
        return (double) shared / terms1.size();
    }

    /**
     * Tells whether a collection is {@code null} or has no elements.
     *
     * @param c the collection to test; may be {@code null}
     * @return {@code true} if {@code c} is {@code null} or empty
     */
    private static boolean isEmpty(Collection<?> c) {
        return c == null || c.isEmpty();
    }

    /**
     * Runs the text through the analyzer and the Porter stemmer and collects
     * the resulting terms in stream order.
     *
     * @param content the text to tokenize; may be {@code null}
     * @return the stemmed terms, duplicates included; empty (never
     *         {@code null}) when {@code content} is {@code null}
     * @throws IllegalStateException if the token stream reports an I/O error,
     *         which is not expected for an in-memory reader
     */
    private List<String> getTokenizedList(String content) {
        List<String> result = new ArrayList<String>();
        if (content == null) {
            // StringReader rejects null; treat it as text without terms.
            return result;
        }

        TokenStream stream = new PorterStemFilter(
                analyzer.tokenStream(FIELD_NAME, new StringReader(content)));
        CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
        try {
            try {
                // Consumer workflow: reset, incrementToken until false, end, close.
                stream.reset();
                while (stream.incrementToken()) {
                    result.add(termAttribute.toString());
                }
                stream.end();
            } finally {
                stream.close();
            }
        } catch (IOException e) {
            // Fail loudly rather than return a silently truncated term list.
            throw new IllegalStateException(
                    "Unexpected I/O failure while tokenizing an in-memory string", e);
        }
        return result;
    }

    /**
     * Small demo: prints the similarity of two sample paper titles.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String first = "Indexing Relational Databases Content Offline for Efficient Keyword-Based Search.";
        String second = "Efficient IR-style Keyword Search over Relational Database.";
        System.out.println(new SimCalculator().calculate(first, second));
    }
}