TF-IDF(term frequency–inverse document frequency)是一种用于信息检索与数据挖掘的常用加权技术。TF 意思是词频(Term Frequency),IDF 意思是逆向文件频率(Inverse Document Frequency)。
思想:先对文本进行分词,然后用 TF-IDF 算法得到文本对应的词向量,最后利用余弦相似度算法求两个文本的相似度。
需要的 jar:je-analysis-1.5.3.jar、lucene-core-2.4.1.jar(lucene 高于 4 的版本会与 je-analysis 冲突)。
/**
 * Computes text similarity by directly comparing two texts: each text is
 * segmented into words, converted to a TF-IDF weight vector, and the two
 * vectors are compared with cosine similarity.
 *
 * @author rock
 */
public class GetText {
// Keys (document identifiers) of every document processed so far.
private static List<String> fileList = new ArrayList<String>();
// Normalized term frequency per document: doc key -> (term -> tf in [0,1]).
private static HashMap<String, HashMap<String, Double>> allTheTf = new HashMap<String, HashMap<String, Double>>();
// Raw term counts per document: doc key -> (term -> occurrence count).
private static HashMap<String, HashMap<String, Integer>> allTheNormalTF = new HashMap<String, HashMap<String, Integer>>();
// Final weight vectors, in insertion order: doc key -> TF-IDF vector.
// NOTE(review): LinkedHashMap presumably so the two documents' vectors stay
// aligned for the cosine computation — confirm against the caller.
private static LinkedHashMap<String, Double[]> vectorMap = new LinkedHashMap<String, Double[]>();
/**
 * Segments a text into words (Chinese word segmentation via MMAnalyzer).
 *
 * @author create by rock
 */
public static String[] TextcutWord(String text) throws IOException {
    // Segment the input with MMAnalyzer, joining tokens with a single space,
    // then split that string back into an array of individual words.
    MMAnalyzer segmenter = new MMAnalyzer();
    String segmented = segmenter.segment(text, " ");
    return segmented.split(" ");
}
/**
 * Computes and caches the raw (un-normalized) term-count map for two texts.
 * Results are memoized in {@code allTheNormalTF} keyed by the supplied keys,
 * so each text is tokenized and counted at most once per key.
 *
 * @param key1  cache key identifying the first text
 * @param key2  cache key identifying the second text
 * @param text1 first text to tokenize and count
 * @param text2 second text to tokenize and count
 * @return the shared cache of term-count maps, including entries for both keys
 * @throws IOException if word segmentation fails
 */
public static Map<String, HashMap<String, Integer>> NormalTFOfAll(String key1, String key2, String text1,
String text2) throws IOException {
// Fixed: the original allocated a throwaway HashMap and immediately
// overwrote the reference with normalTF(...)'s result; store directly.
if (!allTheNormalTF.containsKey(key1)) {
allTheNormalTF.put(key1, normalTF(TextcutWord(text1)));
}
if (!allTheNormalTF.containsKey(key2)) {
allTheNormalTF.put(key2, normalTF(TextcutWord(text2)));
}
return allTheNormalTF;
}
/**
 * Computes the normalized term-frequency maps for two texts and stores them in
 * {@code allTheTf} under the supplied keys. Unlike {@code NormalTFOfAll}, the
 * map is cleared and recomputed on every call — no memoization.
 *
 * @param key1  key identifying the first text
 * @param key2  key identifying the second text
 * @param text1 first text to tokenize and weigh
 * @param text2 second text to tokenize and weigh
 * @return the shared map holding exactly the two freshly computed entries
 * @throws IOException if word segmentation fails
 */
public static Map<String, HashMap<String, Double>> tfOfAll(String key1, String key2, String text1, String text2)
throws IOException {
allTheTf.clear();
// Fixed: the original allocated throwaway HashMaps for dict1/dict2 and
// immediately overwrote the references; put tf(...)'s result in directly.
allTheTf.put(key1, tf(TextcutWord(text1)));
allTheTf.put(key2, tf(TextcutWord(text2)));
return allTheTf;
}
/**
 * Computes the normalized term frequency (count of each word divided by the
 * total number of words) from an array of segmented words.
 *
 * @author create by rock
 */
public static HashMap<String, Double> tf(String[] cutWordResult) {
HashMap<String, Double> tf = new HashMap<String, Double>();// 正规化
int wordNum = cutWordResult.length;
int wordtf = 0;
for (int i = 0; i < wordNum; i++) {
wordtf = 0;
if (cutWordResult[i] != " ") {
for (int j = 0; j < wordNum; j++) {
if (i != j) {
if (cutWordResult[i].equals(cutWordResult[j])) {
cutWordResult[j] = " ";
wordtf++;
}
}
}
tf.put(cutWordResult[i], (new Double(++wordtf)) / wordNum);
cutWordResult[i] =