TFIDF算法,java代码实现

TFIDF算法,java代码实现

TF-IDF算法全称为term frequency–inverse document frequency。TF就是term frequency的缩写,意为词频。IDF则是inverse document frequency的缩写,意为逆文档频率。
这里写图片描述

package com.aawant.nlp.featureExtraction.tfidf;

import java.io.BufferedReader;
import java.io.File;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;


public class TFIDF {
    /**
     * 
     * @param p_path1 文章路径,文件格式  
     * @param p_path2 文档路径
     * @return 词,TFIDF值
     * @throws Exception
     */
    public static Map<?, ?> getTFIDT(String p_path1, String p_path2)
            throws Exception {
        File file1 = new File(p_path1); 
        File file2 = new File(p_path2); 
        Map<String, Double> term_freq_map = new HashMap<String, Double>();// 词,该词在该文档中出现的频率
        Map<String, Double> doc_freq_map = new HashMap<String, Double>(); // 词,文档频率
        int doc_numbers_total = 0; // 总的文档数目
        Double doc_numbers_term = 0.0;// 包含该词条的文档数目
        //词频
        BufferedReader br_doc = null;
        br_doc = StringUtils.getBr(file1, "UTF-8");
        String line = "";
        while ((line = br_doc.readLine()) != null) {
            String[] temp_words = line.split(" ");
            for (String w : temp_words) {
                if (term_freq_map.containsKey(w)) {
                    Double value = term_freq_map.get(w) + 1.0;
                    term_freq_map.put(w, value);
                } else {
                    term_freq_map.put(w, 1.0);
                }
            }
        }
        br_doc.close();
        // 文档频率
        Set<String> set = term_freq_map.keySet();
        Iterator<String> iter = set.iterator();
        File[] files = file2.listFiles();
        doc_numbers_total = files.length;
        BufferedReader br_file = null;
        String temp2 = "";
        while (iter.hasNext()) {
            temp2 = (String) iter.next();
            String temp = "";
            for (File f : files) {
                StringBuilder sb = new StringBuilder();
                br_file = StringUtils.getBr(f, "utf-8");
                while ((temp = br_file.readLine()) != null) {
                    String words = temp;
                    sb.append(words);
                }
                String words_adoc = sb.toString();
                if (words_adoc.contains(temp2)) {
                    doc_numbers_term += 1.0;
                    doc_freq_map.put(temp2, doc_numbers_term);
                }
                br_file.close();
            }
            doc_numbers_term = 0.0;
        }
        /*
         * 计算TFIDF值
         * w = 词频*log(总的文档数目/文档频率)
         */
        Map<String, Double> re_map = new HashMap<String, Double>();
        String term = "";
        Set<String> set_t = term_freq_map.keySet();
        Iterator<String> iter_t = set_t.iterator();
        while(iter_t.hasNext()){
            term = (String) iter_t.next();
            Double A = term_freq_map.get(term);
            Double B = doc_freq_map.get(term);
            Double result = A*Math.log(doc_numbers_total/B)/Math.log(2.0);
            re_map.put(term, result);
        }   
        return re_map;
    }

}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值