java tfidf 训练_TFIDF算法java实现 | 学步园

package tfidf;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.io.InputStreamReader;

import java.io.UnsupportedEncodingException;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import jeasy.analysis.MMAnalyzer;

public class ReadFiles {

private static List fileList = new ArrayList();

private static HashMap> allTheTf = new HashMap>();

private static HashMap> allTheNormalTF = new HashMap>();

public static List readDirs(String filepath) throws FileNotFoundException, IOException {

try {

File file = new File(filepath);

if (!file.isDirectory()) {

System.out.println("输入的参数应该为[文件夹名]");

System.out.println("filepath: " + file.getAbsolutePath());

} else if (file.isDirectory()) {

String[] filelist = file.list();

for (int i = 0; i < filelist.length; i++) {

File readfile = new File(filepath + "\\" + filelist[i]);

if (!readfile.isDirectory()) {

//System.out.println("filepath: " + readfile.getAbsolutePath());

fileList.add(readfile.getAbsolutePath());

} else if (readfile.isDirectory()) {

readDirs(filepath + "\\" + filelist[i]);

}

}

}

} catch (FileNotFoundException e) {

System.out.println(e.getMessage());

}

return fileList;

}

public static String readFiles(String file) throws FileNotFoundException, IOException {

StringBuffer sb = new StringBuffer();

InputStreamReader is = new InputStreamReader(new FileInputStream(file), "gbk");

BufferedReader br = new BufferedReader(is);

String line = br.readLine();

while (line != null) {

sb.append(line).append("\r\n");

line = br.readLine();

}

br.close();

return sb.toString();

}

public static String[] cutWord(String file) throws IOException {

String[] cutWordResult = null;

String text = ReadFiles.readFiles(file);

MMAnalyzer analyzer = new MMAnalyzer();

//System.out.println("file content: "+text);

//System.out.println("cutWordResult: "+analyzer.segment(text, " "));

String tempCutWordResult = analyzer.segment(text, " ");

cutWordResult = tempCutWordResult.split(" ");

return cutWordResult;

}

public static HashMap tf(String[] cutWordResult) {

HashMap tf = new HashMap();//正规化

int wordNum = cutWordResult.length;

int wordtf = 0;

for (int i = 0; i < wordNum; i++) {

wordtf = 0;

for (int j = 0; j < wordNum; j++) {

if (cutWordResult[i] != " " && i != j) {

if (cutWordResult[i].equals(cutWordResult[j])) {

cutWordResult[j] = " ";

wordtf++;

}

}

}

if (cutWordResult[i] != " ") {

tf.put(cutWordResult[i], (new Float(++wordtf)) / wordNum);

cutWordResult[i] = " ";

}

}

return tf;

}

public static HashMap normalTF(String[] cutWordResult) {

HashMap tfNormal = new HashMap();//没有正规化

int wordNum = cutWordResult.length;

int wordtf = 0;

for (int i = 0; i < wordNum; i++) {

wordtf = 0;

if (cutWordResult[i] != " ") {

for (int j = 0; j < wordNum; j++) {

if (i != j) {

if (cutWordResult[i].equals(cutWordResult[j])) {

cutWordResult[j] = " ";

wordtf++;

}

}

}

tfNormal.put(cutWordResult[i], ++wordtf);

cutWordResult[i] = " ";

}

}

return tfNormal;

}

public static Map> tfOfAll(String dir) throws IOException {

List fileList = ReadFiles.readDirs(dir);

for (String file : fileList) {

HashMap dict = new HashMap();

dict = ReadFiles.tf(ReadFiles.cutWord(file));

allTheTf.put(file, dict);

}

return allTheTf;

}

public static Map> NormalTFOfAll(String dir) throws IOException {

List fileList = ReadFiles.readDirs(dir);

for (int i = 0; i < fileList.size(); i++) {

HashMap dict = new HashMap();

dict = ReadFiles.normalTF(ReadFiles.cutWord(fileList.get(i)));

allTheNormalTF.put(fileList.get(i), dict);

}

return allTheNormalTF;

}

public static Map idf(String dir) throws FileNotFoundException, UnsupportedEncodingException, IOException {

//公式IDF=log((1+|D|)/|Dt|),其中|D|表示文档总数,|Dt|表示包含关键词t的文档数量。

Map idf = new HashMap();

List located = new ArrayList();

float Dt = 1;

float D = allTheNormalTF.size();//文档总数

List key = fileList;//存储各个文档名的List

Map> tfInIdf = allTheNormalTF;//存储各个文档tf的Map

for (int i = 0; i < D; i++) {

HashMap temp = tfInIdf.get(key.get(i));

for (String word : temp.keySet()) {

Dt = 1;

if (!(located.contains(word))) {

for (int k = 0; k < D; k++) {

if (k != i) {

HashMap temp2 = tfInIdf.get(key.get(k));

if (temp2.keySet().contains(word)) {

located.add(word);

Dt = Dt + 1;

continue;

}

}

}

idf.put(word, Log.log((1 + D) / Dt, 10));

}

}

}

return idf;

}

public static Map> tfidf(String dir) throws IOException {

Map idf = ReadFiles.idf(dir);

Map> tf = ReadFiles.tfOfAll(dir);

for (String file : tf.keySet()) {

Map singelFile = tf.get(file);

for (String word : singelFile.keySet()) {

singelFile.put(word, (idf.get(word)) * singelFile.get(word));

}

}

return tf;

}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值