TF-IDF Computation Based on the Paoding (庖丁) Chinese Word Segmenter

Recent testing suggests that the Paoding (庖丁) segmenter performs somewhat better than other Chinese word segmenters, and TF-IDF is a standard method for weighting term frequencies. The TF-IDF derivation itself is not covered in detail here.
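
For reference, the standard definitions that the code below implements are:

    tf(t, d) = (occurrences of t in d) / (total tokens in d)
    idf(t) = ln(N / df(t)), where N is the number of files and df(t) is the number of files containing t
    tfidf(t, d) = tf(t, d) * idf(t)

Note that a word appearing in every file gets an IDF of 0 under this definition.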

Here is the code:

package com.util;

import java.io.*;
import java.util.*;
import java.util.Map.Entry;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TFIDFMeasure {

	// accumulated list of file paths; note this field is static and is never
	// cleared, so repeated runs within one JVM keep appending to it
	private static List<String> FileList = new ArrayList<String>();

	// get list of file for the directory, including sub-directory of it
	public static List<String> readDirs(String filepath)
			throws FileNotFoundException, IOException {
		try {
			File file = new File(filepath);
			if (!file.isDirectory()) {
				System.out.println("The input path is not a directory.");
				System.out.println("filepath: " + file.getAbsolutePath());
			} else {
				String[] flist = file.list();
				for (int i = 0; i < flist.length; i++) {
					// use File.separator so the code is not Windows-only
					File newfile = new File(filepath + File.separator + flist[i]);
					if (!newfile.isDirectory()) {
						FileList.add(newfile.getAbsolutePath());
					} else { // if the entry is a directory, recurse into it
						readDirs(filepath + File.separator + flist[i]);
					}
				}
			}
		} catch (FileNotFoundException e) {
			System.out.println(e.getMessage());
		}
		return FileList;
	}
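
	// Note: readDirs fills the static FileList as a side effect; idf() and
	// tf_idf() below iterate over that same list.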

	// read file
	public static String readFile(String file) throws FileNotFoundException,
			IOException {
		StringBuffer strSb = new StringBuffer(); // StringBuffer is mutable, unlike String
		InputStreamReader inStrR = new InputStreamReader(new FileInputStream(
				file), "UTF-8"); // decode bytes to characters as UTF-8
		BufferedReader br = new BufferedReader(inStrR);
		String line = br.readLine();
		while (line != null) {
			strSb.append(line).append("\r\n");
			line = br.readLine();
		}
		br.close();
		return strSb.toString();
	}

	// word segmentation
	public static List<String> cutWords(String file) throws IOException {

		List<String> words = new ArrayList<String>();
		String text = TFIDFMeasure.readFile(file);		
//		IKAnalyzer analyzer = new IKAnalyzer();
//		words = analyzer.split(text);
		Analyzer analyzer = new PaodingAnalyzer(); // Paoding analyzer
		TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
		CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
		ts.reset(); // the TokenStream contract expects reset() before consuming
		while (ts.incrementToken()) {
			words.add(termAtt.toString());
		}
		ts.end();
		ts.close();
		analyzer.close();
		return words;
	}
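
	// Example (hypothetical file path): cutWords("DataMiningSample/text/test1/doc1.txt")
	// returns that file's content as a list of segmented tokens.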

	// raw term counts for one file: number of occurrences of each word
	public static Map<String, Integer> normalTF(List<String> cutwords) {
		Map<String, Integer> resTF = new HashMap<String, Integer>();

		for (String word : cutwords) {
			Integer count = resTF.get(word);
			resTF.put(word, count == null ? 1 : count + 1);
		}
		return resTF;
	}

	// relative term frequency for one file: count / total number of tokens
	public static Map<String, Double> tf(List<String> cutwords) {
		Map<String, Double> resTF = new HashMap<String, Double>();

		int wordLen = cutwords.size();
		Map<String, Integer> intTF = TFIDFMeasure.normalTF(cutwords);

		for (Entry<String, Integer> entry : intTF.entrySet()) {
			resTF.put(entry.getKey(), entry.getValue().doubleValue() / wordLen);
		}
		return resTF;
	}
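
	// e.g. a word occurring 3 times in a 100-token file gets tf = 3/100 = 0.03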

	// raw term counts for every file in a directory
	public static Map<String, Map<String, Integer>> normalTFAllFiles(
			String dirc) throws IOException {
		Map<String, Map<String, Integer>> allNormalTF = new HashMap<String, Map<String, Integer>>();

		List<String> filelist = TFIDFMeasure.readDirs(dirc);
		for (String file : filelist) {
			List<String> cutwords = TFIDFMeasure.cutWords(file); // segment one file
			Map<String, Integer> dict = TFIDFMeasure.normalTF(cutwords);
			allNormalTF.put(file, dict);
		}
		return allNormalTF;
	}

	// relative term frequencies for every file in a directory
	public static Map<String, Map<String, Double>> tfAllFiles(String dirc)
			throws IOException {
		Map<String, Map<String, Double>> allTF = new HashMap<String, Map<String, Double>>();
		List<String> filelist = TFIDFMeasure.readDirs(dirc);

		for (String file : filelist) {
			List<String> cutwords = TFIDFMeasure.cutWords(file); // segment one file
			Map<String, Double> dict = TFIDFMeasure.tf(cutwords);
			allTF.put(file, dict);
		}
		return allTF;
	}

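	// inverse document frequency: idf(t) = ln(N / df(t)), where df(t) is the
	// number of files containing the word t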
	public static Map<String, Double> idf(
			Map<String, Map<String, Double>> all_tf) {
		Map<String, Double> resIdf = new HashMap<String, Double>();
		Map<String, Integer> dict = new HashMap<String, Integer>();
		int docNum = FileList.size();

		for (int i = 0; i < docNum; i++) {
			Map<String, Double> temp = all_tf.get(FileList.get(i));
			for (Entry<String, Double> entry : temp.entrySet()) {
				String word = entry.getKey().toString();
				if (dict.get(word) == null) {
					dict.put(word, 1);
				} else {
					dict.put(word, dict.get(word) + 1);
				}
			}			
		}
		System.out.println("IDF for every word is:");
		for (Entry<String, Integer> entry : dict.entrySet()) {
			double value = (float) Math.log(docNum
					/ Double.parseDouble(entry.getValue().toString()));
			resIdf.put(entry.getKey().toString(), value);
//			System.out.println(entry.getKey().toString() + " = " + value);
		}
	
		return resIdf;
	}

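	// combine each file's tf map with the global idf map into per-file
	// tf-idf scores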
	public static Map<String, Map<String, Double>> tf_idf(Map<String, Map<String, Double>> all_tf,
			Map<String, Double> idfs) {
		Map<String, Map<String, Double>> resTfIdf = new HashMap<String, Map<String, Double>>();

		int docNum = FileList.size();
		for (int i = 0; i < docNum; i++) {
			String filepath = FileList.get(i);
			Map<String, Double> tfidf = new HashMap<String, Double>();
			Map<String, Double> temp = all_tf.get(filepath);
			for (Entry<String, Double> entry : temp.entrySet()) {
				String word = entry.getKey();
				Double value = entry.getValue() * idfs.get(word); // tf * idf
				tfidf.put(word, value);
			}
			
			resTfIdf.put(filepath, tfidf);
		}
	
		return resTfIdf;
	}

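	// print per-file tf-idf rankings, then rank the files themselves by the
	// sum of their two highest scores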
	public static void disTfIdf(Map<String, Map<String, Double>> tfidf) throws IOException {
		 
		System.out.println("TF-IDF for Every file is :");
		Map<String,Double> resultMap = new HashMap<String,Double>();
		for (Entry<String, Map<String, Double>> entrys : tfidf.entrySet()) {
			System.out.println("FileName: " + entrys.getKey().toString());
			System.out.println("{");
			Map<String, Double> temp = (HashMap<String, Double>) entrys
					.getValue();
			ArrayList<Map.Entry<String, Double>> infoIds =  
	                new ArrayList<Map.Entry<String, Double>>(temp.entrySet());  
	        Collections.sort(infoIds, new Comparator<Map.Entry<String, Double>>() {     
	            public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) {  
	                return o2.getValue().compareTo(o1.getValue());  
	            }  
	        }); 
			for (Entry<String, Double> entry : infoIds) {
				System.out.println(entry.getKey().toString() + " = "
						+ entry.getValue().toString() + ", ");
			}
			System.out.println("}");
			resultMap.put(entrys.getKey().toString(), infoIds.get(0).getValue()+infoIds.get(1).getValue());//存入高分中前两名的和
		}
		ArrayList<Map.Entry<String, Double>> infoIds2 =  
                new ArrayList<Map.Entry<String, Double>>(resultMap.entrySet()); 
		Collections.sort(infoIds2, new Comparator<Map.Entry<String, Double>>() {     
            public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) {  
                return o2.getValue().compareTo(o1.getValue());  
            }  
        });
		for (Entry<String, Double> entry : infoIds2) {
			System.out.println(readFile(entry.getKey().toString()) + " = "
					+ entry.getValue().toString() + ", ");
		}
	}

	public static void main(String[] args) throws IOException {
		String dir = "DataMiningSample/text/test1"; // directory of input files

		Map<String, Map<String, Double>> all_tf = tfAllFiles(dir);
		System.out.println();
		Map<String, Double> idfs = idf(all_tf);
		System.out.println();
		Map<String, Map<String, Double>> tf_idf = tf_idf(all_tf, idfs);		
		disTfIdf(tf_idf);
	}

}
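
A note on setup: the paoding-analysis jar and a compatible lucene-core jar must be on the classpath, and Paoding has to be able to locate its dictionary files; it reads the PAODING_DIC_HOME environment variable (or the paoding.dic.home entry in paoding-dic-home.properties) at startup. The exact jar versions depend on your environment.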


For comparison, here is a C# implementation of the same idea, with cosine-similarity helpers; the listing breaks off mid-declaration:

namespace ServiceRanking
{
	/// <summary>
	/// Summary description for TF_IDFLib.
	/// </summary>
	public class TFIDFMeasure
	{
		private string[] _docs;
		private string[][] _ngramDoc;
		private int _numDocs = 0;
		private int _numTerms = 0;
		private ArrayList _terms;
		private int[][] _termFreq;
		private float[][] _termWeight;
		private int[] _maxTermFreq;
		private int[] _docFreq;

		public class TermVector
		{
			public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
			{
				if (vector1.Length != vector2.Length)
					throw new Exception("DIFER LENGTH");
				float denom = (VectorLength(vector1) * VectorLength(vector2));
				if (denom == 0F)
					return 0F;
				else
					return (InnerProduct(vector1, vector2) / denom);
			}

			public static float InnerProduct(float[] vector1, float[] vector2)
			{
				if (vector1.Length != vector2.Length)
					throw new Exception("DIFFER LENGTH ARE NOT ALLOWED");
				float result = 0F;
				for (int i = 0; i < vector1.Length; i++)
					result += vector1[i] * vector2[i];
				return result;
			}

			public static float VectorLength(float[] vector)
			{
				float sum = 0.0F;
				for (int i = 0; i < vector.Length; i++)
					sum = sum + (vector[i] * vector[i]);
				return (float)Math.Sqrt(sum);
			}
		}

		private IDictionary _wordsIndex = new Hashtable();

		public TFIDFMeasure(string[] documents)
		{
			_docs = documents;
			_numDocs = documents.Length;
			MyInit();
		}

		private void GeneratNgramText()
		{
		}

		private ArrayList GenerateTerms(string[] docs)
		{
			ArrayList uniques = new ArrayList();
			_ngramDoc = new string[_numDocs][];
			for (int i = 0; i < docs.Length; i++)
			{
				Tokeniser tokenizer = new Tokeniser();
				string[] words = tokenizer.Partition(docs[i]);
				for (int j = 0; j < words.Length; j++)
					if (!uniques.Contains(words[j]))
						uniques.Add(words[j]);
			}
			return uniques;
		}

		private static object