Java 实现:使用 IK 分词器计算 TF-IDF。
Doc 类,表示一篇文档。
package pojo;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
/**
 * A single document in the corpus: metadata (title, url, docno), the raw
 * content, and — after {@link #seg()} has run — the token list and per-term
 * occurrence counts produced by the IK analyzer.
 *
 * @version v0.1, 17/09/04
 */
public class Doc {

    /** True once seg() has successfully tokenized the content. */
    private boolean isSeg = false;

    /** Document title. */
    private String title;

    /** Document URL. */
    private String url;

    /** Raw document content (tag-stripped text). */
    private String content;

    /** Document number / identifier. */
    private String docno;

    /** Tokens in document order; populated by seg(). */
    private List<String> segs;

    /** Term -> occurrence count within this document; populated by seg(). */
    private Map<String, Integer> segMap;

    /** Creates an empty document. */
    public Doc() {}

    /**
     * Creates a document with its metadata and content.
     *
     * @param url     document URL
     * @param docno   document number / identifier
     * @param title   document title
     * @param content raw document content
     */
    public Doc(String url, String docno, String title, String content) {
        this.title = title;
        this.url = url;
        this.content = content;
        this.docno = docno;
    }

    /**
     * Tokenizes {@link #content} with the IK analyzer (smart mode) and fills
     * {@link #segs} (token list) and {@link #segMap} (term counts).
     * Idempotent: returns immediately if the document was already segmented.
     */
    public void seg() {
        if (isSeg()) { // already segmented — nothing to do
            System.out.println("已分词");
            return;
        }
        Reader reader = new StringReader(content);
        IKSegmenter ikSegmenter = new IKSegmenter(reader, true); // true = smart segmentation mode
        segs = new ArrayList<String>();
        segMap = new HashMap<String, Integer>();
        try {
            Lexeme lexeme;
            while ((lexeme = ikSegmenter.next()) != null) {
                String term = lexeme.getLexemeText();
                System.out.println(term);
                segs.add(term);
                Integer count = segMap.get(term); // single lookup instead of containsKey + get
                segMap.put(term, (count == null) ? 1 : count + 1);
            }
            isSeg = true; // mark complete only after a full, successful pass
        } catch (IOException e) {
            // StringReader does not do real I/O, but IKSegmenter.next() declares IOException.
            e.printStackTrace();
        }
    }

    @Override
    public String toString() {
        return "Doc{" +
               "isSeg=" + isSeg +
               ", title='" + title + '\'' +
               ", url='" + url + '\'' +
               ", content='" + content + '\'' +
               ", docno='" + docno + '\'' +
               ", segs=" + segs +
               ", segMap=" + segMap +
               '}';
    }

    /**
     * @return the raw document content
     */
    public String getContent() {
        return content;
    }

    /**
     * @return the document number / identifier
     */
    public String getDocno() {
        return docno;
    }

    /**
     * @return true if seg() has already tokenized this document
     */
    public boolean isSeg() {
        return isSeg;
    }

    /**
     * @return term -> occurrence count map, or null before seg() has run
     */
    public Map<String, Integer> getSegMap() {
        return segMap;
    }

    /**
     * @return token list in document order, or null before seg() has run
     */
    public List<String> getSegs() {
        return segs;
    }

    /**
     * @return the document title
     */
    public String getTitle() {
        return title;
    }

    /**
     * @return the document URL
     */
    public String getUrl() {
        return url;
    }
}
//~ Formatted by Jindent --- http://www.jindent.com
TfIdfUtil 类,计算 TF-IDF 的工具类。
package util;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import pojo.Doc;
/**
 * Computes TF-IDF over a document collection.
 * Reads a corpus file (Sogou Chinese news corpus format), extracts each
 * document's title/url/docno/content with regular expressions and strips the
 * tags, segments the content (see {@link pojo.Doc#seg()}), then computes
 * tf, idf and tf-idf, writing the results to both the console and a target
 * file.
 *
 * @version v0.1, 17/09/04
 * @author Kiwi Liu
 */
public class TfIdfUtil {

    // Patterns are compiled once here instead of being recompiled on every
    // loop iteration as before.
    /** One &lt;doc&gt;...&lt;/doc&gt; record (reluctant so adjacent records do not merge). */
    private static final Pattern DOC_PATTERN = Pattern.compile("<doc>.*?</doc>");
    /** Title field; reluctant quantifier fixes the original greedy ".*". */
    private static final Pattern TITLE_PATTERN = Pattern.compile("<contenttitle>.*?</contenttitle>");
    /** Url field. */
    private static final Pattern URL_PATTERN = Pattern.compile("<url>.*?</url>");
    /** Docno field. */
    private static final Pattern DOCNO_PATTERN = Pattern.compile("<docno>.*?</docno>");
    /** Content field. */
    private static final Pattern CONTENT_PATTERN = Pattern.compile("<content>.*?</content>");
    /** Any markup tag; used to strip tags from an extracted field. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /** Document set. */
    private static List<Doc> docList = new ArrayList<Doc>();

    /** Per-document term frequencies, keyed by docno. */
    private static Map<String, Map<String, Double>> docSetTfMap = new HashMap<String, Map<String, Double>>();

    /** Document frequency of each term (number of documents containing it). */
    private static Map<String, Integer> docSetDcMap = new HashMap<String, Integer>();

    /** Inverse document frequency of each term. */
    private static Map<String, Double> docSetIdfMap = new HashMap<String, Double>();

    /**
     * Extracts one tagged field from a document record and strips its tags.
     *
     * @param fieldPattern   pattern matching the whole field including its tags
     * @param doc            the &lt;doc&gt;...&lt;/doc&gt; record text
     * @param tagReplacement replacement for each stripped tag ("" or " ")
     * @return the field text with tags replaced, or null if the field is absent
     */
    private static String extractField(Pattern fieldPattern, String doc, String tagReplacement) {
        Matcher matcher = fieldPattern.matcher(doc);
        if (!matcher.find()) {
            return null;
        }
        return TAG_PATTERN.matcher(matcher.group()).replaceAll(tagReplacement);
    }

    /**
     * Builds the document set from the corpus file: extracts each document's
     * title, url, docno and content, segments the content, and adds the
     * document to {@link #docList}.
     *
     * @param corpusPath corpus file path
     */
    private static void createDocSet(String corpusPath) {
        StringBuffer corpusBuffer = new StringBuffer(); // whole corpus as one string
        try {
            // NOTE(review): FileReader uses the platform default charset; the
            // Sogou corpus is commonly GBK-encoded — confirm, and switch to an
            // explicit-charset InputStreamReader if text comes out garbled.
            BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(corpusPath)));
            try {
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    corpusBuffer.append(line);
                }
            } finally {
                bufferedReader.close(); // fix: the reader was previously never closed
            }
            Matcher matcherDoc = DOC_PATTERN.matcher(corpusBuffer.toString());
            while (matcherDoc.find()) { // one iteration per <doc> record
                String doc = matcherDoc.group();
                // The original code replaced title tags with " " but every
                // other field's tags with "" — preserved here.
                String title = extractField(TITLE_PATTERN, doc, " ");
                if (title != null) {
                    System.out.println("title: " + title);
                }
                String url = extractField(URL_PATTERN, doc, "");
                if (url != null) {
                    System.out.println("url: " + url);
                }
                String docno = extractField(DOCNO_PATTERN, doc, "");
                if (docno != null) {
                    System.out.println("docno: " + docno);
                }
                String content = extractField(CONTENT_PATTERN, doc, "");
                if (content != null) {
                    System.out.println("content: " + content);
                }
                Doc d = new Doc(url, docno, title, content);
                d.seg(); // tokenize and count terms
                docList.add(d);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes the inverse document frequency of every term in the document
     * set: idf = log(|docs| / df(term)).
     */
    private static void idf() {
        for (Doc doc : docList) { // count, per term, how many documents contain it
            for (String seg : doc.getSegMap().keySet()) {
                Integer count = docSetDcMap.get(seg);
                docSetDcMap.put(seg, (count == null) ? 1 : count + 1);
            }
        }
        // idf = log(size(doc set) / docs(d, w))
        for (String seg : docSetDcMap.keySet()) {
            docSetIdfMap.put(seg, Math.log((double) docList.size() / (double) docSetDcMap.get(seg)));
        }
    }

    /**
     * Computes the term frequency of every term in every document:
     * tf = count(w, d) / size(d).
     */
    private static void tf() {
        for (Doc doc : docList) {
            Map<String, Double> docTf = new HashMap<String, Double>(); // this document's term frequencies
            for (String seg : doc.getSegMap().keySet()) {
                docTf.put(seg, Double.valueOf((double) doc.getSegMap().get(seg) / (double) doc.getSegs().size()));
            }
            // NOTE(review): documents are keyed by docno; a null or duplicate
            // docno silently collides here — confirm docnos are unique.
            docSetTfMap.put(doc.getDocno(), docTf);
        }
    }

    /**
     * Writes per-document tf / idf / tf-idf results to the console and to the
     * target file.
     *
     * @param dstPath target file path
     */
    private static void print(String dstPath) {
        try {
            BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(new File(dstPath)));
            try { // fix: close the writer even when writing throws mid-loop
                for (Doc doc : docList) {
                    System.out.println(doc);
                    bufferedWriter.write(doc.toString());
                    bufferedWriter.newLine();
                    for (String seg : doc.getSegMap().keySet()) {
                        double tf = docSetTfMap.get(doc.getDocno()).get(seg);
                        double idf = docSetIdfMap.get(seg);
                        String segInfo = seg + " tf: " + tf + " idf: " + idf + " tfidf: " + tf * idf;
                        System.out.println(segInfo);
                        bufferedWriter.write(segInfo);
                        bufferedWriter.newLine();
                    }
                }
            } finally {
                bufferedWriter.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * End-to-end TF-IDF computation: build the document set, compute tf and
     * idf, then print the results.
     *
     * @param corpusPath corpus file path
     * @param dstPath    target file path
     */
    public static void tfidf(String corpusPath, String dstPath) {
        // Fix: reset static state so a repeated call does not accumulate
        // documents and counts left over from a previous run.
        docList.clear();
        docSetTfMap.clear();
        docSetDcMap.clear();
        docSetIdfMap.clear();
        createDocSet(corpusPath);
        tf();
        idf();
        print(dstPath);
    }
}
//~ Formatted by Jindent --- http://www.jindent.com