import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.lang.Math;;
/**
 * Tokenises a text file line-by-line with IKAnalyzer and prints the
 * TF-IDF weight of every token, treating each line as one document.
 *
 * Fixes over the original:
 *  - reads the whole file (was capped at 1000 chars) and closes the
 *    reader even on exception;
 *  - document count is derived from the actual number of lines instead
 *    of the hard-coded 13 (which could throw ArrayIndexOutOfBoundsException
 *    and skewed the IDF denominator);
 *  - per-row arrays are allocated once per row, not once per word
 *    (the old code wiped earlier columns on every inner iteration);
 *  - CountInclude matches whole tokens instead of substrings.
 */
public class func2 {
    public static void main(String[] args) throws Exception {
        // 读取文档内容 — read the entire file, not just the first 1000 chars.
        StringBuilder raw = new StringBuilder();
        FileReader fr2 = new FileReader("C://Users//Administrator//Desktop//作业1//文本分类.txt");
        try {
            char[] buf = new char[1000];
            int n;
            while ((n = fr2.read(buf)) != -1) {
                raw.append(buf, 0, n);
            }
        } finally {
            // close the reader even if reading throws (the original leaked it on error)
            fr2.close();
        }
        String text = raw.toString().toLowerCase();

        // 将文档内容按行存入数组 — each line is treated as one document.
        String[] str0 = text.split("\n");
        int docCount = str0.length; // was hard-coded as 13

        // 设置分词器对象及分词方式
        System.out.println("…………正在分词…………");
        IKAnalyzer analyzer = new IKAnalyzer();
        analyzer.setUseSmart(true);

        // 对每一行执行分词操作并覆盖原行
        for (int i = 0; i < docCount; i++) {
            str0[i] = AnalysisResult(analyzer, str0[i]);
        }

        // 输出分词结果并将分词结果存入二维数组
        String[][] str = new String[docCount][];
        for (int i = 0; i < docCount; i++) {
            System.out.println(str0[i]);
            str[i] = str0[i].split(" ");
        }

        // 计算TF-IDF
        System.out.println("…………正在计算TF-IDF…………");
        func1 obj = new func1();
        int[][] tf = new int[docCount][];
        int[][] include = new int[docCount][];
        double[][] idf = new double[docCount][];
        double[][] tfidf = new double[docCount][];
        for (int i = 0; i < docCount; i++) {
            // allocate each row once — the original re-allocated these inside
            // the inner loop, discarding every previously computed column
            tf[i] = new int[str[i].length];
            include[i] = new int[str[i].length];
            idf[i] = new double[str[i].length];
            tfidf[i] = new double[str[i].length];
            for (int j = 0; j < str[i].length; j++) {
                // 计算词频(TF) — occurrences of the token in the whole text
                tf[i][j] = obj.count(text, str[i][j]);
                // 计算逆文档频率(IDF) — log10(docCount / lines containing the token);
                // include[i][j] >= 1 because the token came from one of the lines
                include[i][j] = CountInclude(str0, str[i][j]);
                idf[i][j] = Math.log10((double) docCount / include[i][j]);
                // 计算TF-IDF
                tfidf[i][j] = tf[i][j] * idf[i][j];
                System.out.print(str[i][j] + ":" + String.format("%.4f", tfidf[i][j]) + "\t");
            }
            System.out.print("\n");
        }
    }

    /**
     * Runs the analyzer over keyWord and returns the tokens joined by
     * single spaces, with a trailing space (matching the original format;
     * an empty input yields "").
     *
     * NOTE(review): no reset()/end()/close() calls on the TokenStream —
     * this mirrors the original code and presumably the (older) Lucene
     * version in use; Lucene 4+ requires reset() before incrementToken().
     * Confirm against the project's Lucene version.
     */
    public static String AnalysisResult(Analyzer analyzer, String keyWord)
            throws Exception {
        TokenStream tokenStream = analyzer.tokenStream("content",
                new StringReader(keyWord));
        // fetch the attribute once up front instead of on every token
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        StringBuilder result = new StringBuilder();
        while (tokenStream.incrementToken()) {
            result.append(term.toString()).append(' ');
        }
        return result.toString();
    }

    /**
     * Document frequency: number of entries in s (space-separated token
     * strings) that contain word as a whole token.
     *
     * The original used indexOf, which also counted lines where word
     * appeared only as a substring of a longer token (e.g. "ab" inside
     * "abc"), inflating the document frequency and deflating the IDF.
     */
    public static int CountInclude(String[] s, String word) {
        int num = 0;
        for (int i = 0; i < s.length; i++) {
            for (String token : s[i].split(" ")) {
                if (token.equals(word)) {
                    num++;
                    break; // count each line at most once
                }
            }
        }
        return num;
    }
}
// 文本分词与TF-IDF
// 最新推荐文章于 2021-06-22 20:33:35 发布
// (NOTE: the two lines above are non-code text scraped from the source web page;
// commented out so the file compiles.)