TF-IDF Computation Based on the Paoding (庖丁) Chinese Word Segmenter

Recent testing suggests that the Paoding (庖丁) segmenter performs somewhat better than other Chinese word segmenters, and TF-IDF is a standard method for weighting term frequencies. The TF-IDF derivation itself is not covered in detail here.
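
For reference, the standard definitions that the code below implements are:

    tf(t, d) = (occurrences of t in d) / (total tokens in d)
    idf(t) = ln(N / df(t)), where N is the number of files and df(t) is the number of files containing t
    tfidf(t, d) = tf(t, d) * idf(t)

Note that a word appearing in every file gets an IDF of 0 under this definition.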

Here is the code:

package com.util;

import java.io.*;
import java.util.*;
import java.util.Map.Entry;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TFIDFMeasure {

	// accumulated list of file paths; note this field is static and is never
	// cleared, so repeated runs within one JVM keep appending to it
	private static List<String> FileList = new ArrayList<String>();

	// get list of file for the directory, including sub-directory of it
	public static List<String> readDirs(String filepath)
			throws FileNotFoundException, IOException {
		try {
			File file = new File(filepath);
			if (!file.isDirectory()) {
				System.out.println("The input path is not a directory.");
				System.out.println("filepath: " + file.getAbsolutePath());
			} else {
				String[] flist = file.list();
				for (int i = 0; i < flist.length; i++) {
					// use File.separator so the code is not Windows-only
					File newfile = new File(filepath + File.separator + flist[i]);
					if (!newfile.isDirectory()) {
						FileList.add(newfile.getAbsolutePath());
					} else { // if the entry is a directory, recurse into it
						readDirs(filepath + File.separator + flist[i]);
					}
				}
			}
		} catch (FileNotFoundException e) {
			System.out.println(e.getMessage());
		}
		return FileList;
	}
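
	// Note: readDirs fills the static FileList as a side effect; idf() and
	// tf_idf() below iterate over that same list.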

	// read file
	public static String readFile(String file) throws FileNotFoundException,
			IOException {
		StringBuffer strSb = new StringBuffer(); // StringBuffer is mutable, unlike String
		InputStreamReader inStrR = new InputStreamReader(new FileInputStream(
				file), "UTF-8"); // decode bytes to characters as UTF-8
		BufferedReader br = new BufferedReader(inStrR);
		String line = br.readLine();
		while (line != null) {
			strSb.append(line).append("\r\n");
			line = br.readLine();
		}
		br.close();
		return strSb.toString();
	}

	// word segmentation
	public static List<String> cutWords(String file) throws IOException {

		List<String> words = new ArrayList<String>();
		String text = TFIDFMeasure.readFile(file);		
//		IKAnalyzer analyzer = new IKAnalyzer();
//		words = analyzer.split(text);
		Analyzer analyzer = new PaodingAnalyzer(); // Paoding analyzer
		TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
		CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
		ts.reset(); // the TokenStream contract expects reset() before consuming
		while (ts.incrementToken()) {
			words.add(termAtt.toString());
		}
		ts.end();
		ts.close();
		analyzer.close();
		return words;
	}
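
	// Example (hypothetical file path): cutWords("DataMiningSample/text/test1/doc1.txt")
	// returns that file's content as a list of segmented tokens.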

	// raw term counts for one file: number of occurrences of each word
	public static Map<String, Integer> normalTF(List<String> cutwords) {
		Map<String, Integer> resTF = new HashMap<String, Integer>();

		for (String word : cutwords) {
			Integer count = resTF.get(word);
			resTF.put(word, count == null ? 1 : count + 1);
		}
		return resTF;
	}

	// relative term frequency for one file: count / total number of tokens
	public static Map<String, Double> tf(List<String> cutwords) {
		Map<String, Double> resTF = new HashMap<String, Double>();

		int wordLen = cutwords.size();
		Map<String, Integer> intTF = TFIDFMeasure.normalTF(cutwords);

		for (Entry<String, Integer> entry : intTF.entrySet()) {
			resTF.put(entry.getKey(), entry.getValue().doubleValue() / wordLen);
		}
		return resTF;
	}
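
	// e.g. a word occurring 3 times in a 100-token file gets tf = 3/100 = 0.03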

	// raw term counts for every file in a directory
	public static Map<String, Map<String, Integer>> normalTFAllFiles(
			String dirc) throws IOException {
		Map<String, Map<String, Integer>> allNormalTF = new HashMap<String, Map<String, Integer>>();

		List<String> filelist = TFIDFMeasure.readDirs(dirc);
		for (String file : filelist) {
			List<String> cutwords = TFIDFMeasure.cutWords(file); // segment one file
			Map<String, Integer> dict = TFIDFMeasure.normalTF(cutwords);
			allNormalTF.put(file, dict);
		}
		return allNormalTF;
	}

	// relative term frequencies for every file in a directory
	public static Map<String, Map<String, Double>> tfAllFiles(String dirc)
			throws IOException {
		Map<String, Map<String, Double>> allTF = new HashMap<String, Map<String, Double>>();
		List<String> filelist = TFIDFMeasure.readDirs(dirc);

		for (String file : filelist) {
			List<String> cutwords = TFIDFMeasure.cutWords(file); // segment one file
			Map<String, Double> dict = TFIDFMeasure.tf(cutwords);
			allTF.put(file, dict);
		}
		return allTF;
	}

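	// inverse document frequency: idf(t) = ln(N / df(t)), where df(t) is the
	// number of files containing the word t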
	public static Map<String, Double> idf(
			Map<String, Map<String, Double>> all_tf) {
		Map<String, Double> resIdf = new HashMap<String, Double>();
		Map<String, Integer> dict = new HashMap<String, Integer>();
		int docNum = FileList.size();

		for (int i = 0; i < docNum; i++) {
			Map<String, Double> temp = all_tf.get(FileList.get(i));
			for (Entry<String, Double> entry : temp.entrySet()) {
				String word = entry.getKey().toString();
				if (dict.get(word) == null) {
					dict.put(word, 1);
				} else {
					dict.put(word, dict.get(word) + 1);
				}
			}			
		}
		System.out.println("IDF for every word is:");
		for (Entry<String, Integer> entry : dict.entrySet()) {
			double value = (float) Math.log(docNum
					/ Double.parseDouble(entry.getValue().toString()));
			resIdf.put(entry.getKey().toString(), value);
//			System.out.println(entry.getKey().toString() + " = " + value);
		}
	
		return resIdf;
	}

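	// combine each file's tf map with the global idf map into per-file
	// tf-idf scores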
	public static Map<String, Map<String, Double>> tf_idf(Map<String, Map<String, Double>> all_tf,
			Map<String, Double> idfs) {
		Map<String, Map<String, Double>> resTfIdf = new HashMap<String, Map<String, Double>>();

		int docNum = FileList.size();
		for (int i = 0; i < docNum; i++) {
			String filepath = FileList.get(i);
			Map<String, Double> tfidf = new HashMap<String, Double>();
			Map<String, Double> temp = all_tf.get(filepath);
			for (Entry<String, Double> entry : temp.entrySet()) {
				String word = entry.getKey();
				Double value = entry.getValue() * idfs.get(word); // tf * idf
				tfidf.put(word, value);
			}
			
			resTfIdf.put(filepath, tfidf);
		}
	
		return resTfIdf;
	}

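	// print per-file tf-idf rankings, then rank the files themselves by the
	// sum of their two highest scores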
	public static void disTfIdf(Map<String, Map<String, Double>> tfidf) throws IOException {
		 
		System.out.println("TF-IDF for Every file is :");
		Map<String,Double> resultMap = new HashMap<String,Double>();
		for (Entry<String, Map<String, Double>> entrys : tfidf.entrySet()) {
			System.out.println("FileName: " + entrys.getKey().toString());
			System.out.println("{");
			Map<String, Double> temp = (HashMap<String, Double>) entrys
					.getValue();
			ArrayList<Map.Entry<String, Double>> infoIds =  
	                new ArrayList<Map.Entry<String, Double>>(temp.entrySet());  
	        Collections.sort(infoIds, new Comparator<Map.Entry<String, Double>>() {     
	            public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) {  
	                return o2.getValue().compareTo(o1.getValue());  
	            }  
	        }); 
			for (Entry<String, Double> entry : infoIds) {
				System.out.println(entry.getKey().toString() + " = "
						+ entry.getValue().toString() + ", ");
			}
			System.out.println("}");
			resultMap.put(entrys.getKey().toString(), infoIds.get(0).getValue()+infoIds.get(1).getValue());//存入高分中前两名的和
		}
		ArrayList<Map.Entry<String, Double>> infoIds2 =  
                new ArrayList<Map.Entry<String, Double>>(resultMap.entrySet()); 
		Collections.sort(infoIds2, new Comparator<Map.Entry<String, Double>>() {     
            public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) {  
                return o2.getValue().compareTo(o1.getValue());  
            }  
        });
		for (Entry<String, Double> entry : infoIds2) {
			System.out.println(readFile(entry.getKey().toString()) + " = "
					+ entry.getValue().toString() + ", ");
		}
	}

	public static void main(String[] args) throws IOException {
		String dir = "DataMiningSample/text/test1"; // directory of input files

		Map<String, Map<String, Double>> all_tf = tfAllFiles(dir);
		System.out.println();
		Map<String, Double> idfs = idf(all_tf);
		System.out.println();
		Map<String, Map<String, Double>> tf_idf = tf_idf(all_tf, idfs);		
		disTfIdf(tf_idf);
	}

}
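
A note on setup: the paoding-analysis jar and a compatible lucene-core jar must be on the classpath, and Paoding has to be able to locate its dictionary files; it reads the PAODING_DIC_HOME environment variable (or the paoding.dic.home entry in paoding-dic-home.properties) at startup. The exact jar versions depend on your environment.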


For comparison, here is a C# implementation of the same idea, with cosine-similarity helpers; the listing breaks off mid-declaration:

namespace ServiceRanking
{
	/// <summary>
	/// Summary description for TF_IDFLib.
	/// </summary>
	public class TFIDFMeasure
	{
		private string[] _docs;
		private string[][] _ngramDoc;
		private int _numDocs = 0;
		private int _numTerms = 0;
		private ArrayList _terms;
		private int[][] _termFreq;
		private float[][] _termWeight;
		private int[] _maxTermFreq;
		private int[] _docFreq;

		public class TermVector
		{
			public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
			{
				if (vector1.Length != vector2.Length)
					throw new Exception("DIFER LENGTH");
				float denom = (VectorLength(vector1) * VectorLength(vector2));
				if (denom == 0F)
					return 0F;
				else
					return (InnerProduct(vector1, vector2) / denom);
			}

			public static float InnerProduct(float[] vector1, float[] vector2)
			{
				if (vector1.Length != vector2.Length)
					throw new Exception("DIFFER LENGTH ARE NOT ALLOWED");
				float result = 0F;
				for (int i = 0; i < vector1.Length; i++)
					result += vector1[i] * vector2[i];
				return result;
			}

			public static float VectorLength(float[] vector)
			{
				float sum = 0.0F;
				for (int i = 0; i < vector.Length; i++)
					sum = sum + (vector[i] * vector[i]);
				return (float)Math.Sqrt(sum);
			}
		}

		private IDictionary _wordsIndex = new Hashtable();

		public TFIDFMeasure(string[] documents)
		{
			_docs = documents;
			_numDocs = documents.Length;
			MyInit();
		}

		private void GeneratNgramText()
		{
		}

		private ArrayList GenerateTerms(string[] docs)
		{
			ArrayList uniques = new ArrayList();
			_ngramDoc = new string[_numDocs][];
			for (int i = 0; i < docs.Length; i++)
			{
				Tokeniser tokenizer = new Tokeniser();
				string[] words = tokenizer.Partition(docs[i]);
				for (int j = 0; j < words.Length; j++)
					if (!uniques.Contains(words[j]))
						uniques.Add(words[j]);
			}
			return uniques;
		}

		private static object