In some recent testing I found that Paoding (庖丁) gives somewhat better results for Chinese word segmentation, and TF-IDF is the standard way to weight term frequencies, so I won't walk through the full TF-IDF derivation here.
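For reference, the code below implements the usual unsmoothed definitions: for a term t in a document d, tf(t, d) = count(t, d) / |d| (the number of occurrences of t divided by the total number of tokens in d); idf(t) = log(N / df(t)), where N is the total number of documents and df(t) is the number of documents containing t; and tf-idf(t, d) = tf(t, d) × idf(t).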
Here is the code:
package com.util;
import java.io.*;
import java.util.*;
import java.util.Map.Entry;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class TFIDFMeasure {
/**
* Computes TF, IDF, and TF-IDF for every file under a directory,
* using the Paoding analyzer for Chinese word segmentation.
*/
private static List<String> FileList = new ArrayList<String>(); // all files found under the input directory
// collect all files under the given directory, recursing into sub-directories
public static List<String> readDirs(String filepath)
throws FileNotFoundException, IOException {
try {
File file = new File(filepath);
if (!file.isDirectory()) {
System.out.println("输入的[]");
System.out.println("filepath:" + file.getAbsolutePath());
} else {
String[] flist = file.list();
for (int i = 0; i < flist.length; i++) {
File newfile = new File(filepath + File.separator + flist[i]);
if (!newfile.isDirectory()) {
FileList.add(newfile.getAbsolutePath());
} else { // if the entry is a directory, recurse into it
readDirs(filepath + File.separator + flist[i]);
}
}
}
} catch (FileNotFoundException e) {
System.out.println(e.getMessage());
}
return FileList;
}
// read file
public static String readFile(String file) throws FileNotFoundException,
IOException {
StringBuilder strSb = new StringBuilder(); // String is immutable, so use a mutable builder to accumulate lines
InputStreamReader inStrR = new InputStreamReader(new FileInputStream(
file), "UTF-8"); // byte streams to character streams
BufferedReader br = new BufferedReader(inStrR);
String line = br.readLine();
while (line != null) {
strSb.append(line).append("\r\n");
line = br.readLine();
}
br.close();
return strSb.toString();
}
// word segmentation
public static List<String> cutWords(String file) throws IOException {
List<String> words = new ArrayList<String>();
String text = TFIDFMeasure.readFile(file);
// IKAnalyzer analyzer = new IKAnalyzer();
// words = analyzer.split(text);
Analyzer analyzer = new PaodingAnalyzer(); // Paoding (庖丁) Chinese word segmentation
TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset(); // newer Lucene versions require reset() before the first incrementToken()
while (ts.incrementToken()) {
words.add(termAtt.toString());
}
ts.close();
analyzer.close();
return words;
}
// raw term frequency for one file: number of occurrences of each word
public static Map<String, Integer> normalTF(List<String> cutwords) {
Map<String, Integer> resTF = new HashMap<String, Integer>();
for (String word : cutwords) {
if (resTF.get(word) == null) {
resTF.put(word, 1);
// System.out.println(word);
} else {
resTF.put(word, resTF.get(word) + 1);
// System.out.println(word.toString());
}
}
return resTF;
}
// normalized term frequency for one file: occurrences divided by the total number of words
public static Map<String, Double> tf(List<String> cutwords) {
Map<String, Double> resTF = new HashMap<String, Double>();
int wordLen = cutwords.size();
Map<String, Integer> intTF = TFIDFMeasure.normalTF(cutwords);
for (Entry<String, Integer> entry : intTF.entrySet()) {
resTF.put(entry.getKey(), entry.getValue() / (double) wordLen);
}
return resTF;
}
// raw term-frequency map for every file under a directory
public static Map<String, Map<String, Integer>> normalTFAllFiles(
String dirc) throws IOException {
Map<String, Map<String, Integer>> allNormalTF = new HashMap<String, Map<String, Integer>>();
List<String> filelist = TFIDFMeasure.readDirs(dirc);
for (String file : filelist) {
List<String> cutwords = TFIDFMeasure.cutWords(file); // segment one file into words
Map<String, Integer> dict = TFIDFMeasure.normalTF(cutwords);
allNormalTF.put(file, dict);
}
return allNormalTF;
}
// normalized term-frequency map for every file under a directory
public static Map<String, Map<String, Double>> tfAllFiles(String dirc)
throws IOException {
Map<String, Map<String, Double>> allTF = new HashMap<String, Map<String, Double>>();
List<String> filelist = TFIDFMeasure.readDirs(dirc);
for (String file : filelist) {
List<String> cutwords = TFIDFMeasure.cutWords(file); // segment one file into words
Map<String, Double> dict = TFIDFMeasure.tf(cutwords);
allTF.put(file, dict);
}
return allTF;
}
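// inverse document frequency: idf(word) = log(number of documents / number of documents containing the word)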
public static Map<String, Double> idf(
Map<String, Map<String, Double>> all_tf) {
Map<String, Double> resIdf = new HashMap<String, Double>();
Map<String, Integer> dict = new HashMap<String, Integer>();
int docNum = FileList.size();
for (int i = 0; i < docNum; i++) {
Map<String, Double> temp = all_tf.get(FileList.get(i));
for (Entry<String, Double> entry : temp.entrySet()) {
String word = entry.getKey().toString();
if (dict.get(word) == null) {
dict.put(word, 1);
} else {
dict.put(word, dict.get(word) + 1);
}
}
}
System.out.println("IDF for every word is:");
for (Entry<String, Integer> entry : dict.entrySet()) {
double value = Math.log(docNum / entry.getValue().doubleValue());
resIdf.put(entry.getKey(), value);
// System.out.println(entry.getKey() + " = " + value);
}
return resIdf;
}
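// tf-idf for every word in every file: tf-idf(word, file) = tf(word, file) * idf(word)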
public static Map<String, Map<String, Double>> tf_idf(Map<String, Map<String, Double>> all_tf,
Map<String, Double> idfs) {
Map<String, Map<String, Double>> resTfIdf = new HashMap<String, Map<String, Double>>();
int docNum = FileList.size();
for (int i = 0; i < docNum; i++) {
String filepath = FileList.get(i);
Map<String, Double> tfidf = new HashMap<String, Double>();
Map<String, Double> temp = all_tf.get(filepath);
for (Entry<String, Double> entry : temp.entrySet()) {
String word = entry.getKey();
Double value = entry.getValue() * idfs.get(word);
tfidf.put(word, value);
}
resTfIdf.put(filepath, tfidf);
}
return resTfIdf;
}
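// print the TF-IDF of every word in every file (sorted in descending order), then rank the files by the sum of their two highest scores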
public static void disTfIdf(Map<String, Map<String, Double>> tfidf) throws IOException {
System.out.println("TF-IDF for Every file is :");
Map<String,Double> resultMap = new HashMap<String,Double>();
for (Entry<String, Map<String, Double>> entrys : tfidf.entrySet()) {
System.out.println("FileName: " + entrys.getKey().toString());
System.out.println("{");
Map<String, Double> temp = entrys.getValue();
ArrayList<Map.Entry<String, Double>> infoIds =
new ArrayList<Map.Entry<String, Double>>(temp.entrySet());
Collections.sort(infoIds, new Comparator<Map.Entry<String, Double>>() {
public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) {
return o2.getValue().compareTo(o1.getValue());
}
});
for (Entry<String, Double> entry : infoIds) {
System.out.println(entry.getKey().toString() + " = "
+ entry.getValue().toString() + ", ");
}
System.out.println("}");
resultMap.put(entrys.getKey(), infoIds.get(0).getValue() + infoIds.get(1).getValue()); // store the sum of the two highest TF-IDF scores (assumes each file has at least two distinct words)
}
ArrayList<Map.Entry<String, Double>> infoIds2 =
new ArrayList<Map.Entry<String, Double>>(resultMap.entrySet());
Collections.sort(infoIds2, new Comparator<Map.Entry<String, Double>>() {
public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) {
return o2.getValue().compareTo(o1.getValue());
}
});
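// print each file's content together with its top-two score sum, highest first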
for (Entry<String, Double> entry : infoIds2) {
System.out.println(readFile(entry.getKey().toString()) + " = "
+ entry.getValue().toString() + ", ");
}
}
public static void main(String[] args) throws IOException {
String file = "DataMiningSample/text/test1";
Map<String, Map<String, Double>> all_tf = tfAllFiles(file);
System.out.println();
Map<String, Double> idfs = idf(all_tf);
System.out.println();
Map<String, Map<String, Double>> tf_idf = tf_idf(all_tf, idfs);
disTfIdf(tf_idf);
}
}
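To run this you need paoding-analysis and the matching Lucene jars on the classpath, and Paoding has to be told where its dictionaries live (typically via the PAODING_DIC_HOME environment variable or a paoding-dic-home.properties file on the classpath); point the directory in main at your own text files.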