1、在pom.xml中添加分词器与word读取依赖
<!-- ik.中文分词器依赖-->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
<!-- lucene依赖 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>4.7.2</version>
</dependency>
<!--word读取-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.14-beta1</version>
</dependency>
2、jsp 使用
<button type="button" style="width: 8%;outline: none;margin-left: 1.5%;margin-bottom: 15px" onclick="ExerciseCheck()" class="btn btn-primary">作业查重</button>
<script type="text/javascript">
function ExerciseCheck() {
// Selected course / chapter / section drive both the recheck request
// and the export-list link.
var CourseID = $("#ds_course").val();
var ChapterID = $("#ds_cnumber").val();
var MinChapterID = $("#ds_snumber").val();
$.ajax({
type:"POST",
url:"/exercise/ExerciseRecheck",
dataType:"json",
data:{
CourseID:CourseID,
ChapterID:ChapterID,
MinChapterID:MinChapterID
},
async:false,
success: function(data){
// Replace (not append) the export link so repeated clicks do not
// accumulate duplicate buttons.
$("#listExport").html(
"<a style=\"width: 100%;outline: none;margin-top: 14px;margin-left: 1.5%;margin-bottom: 15px\" href=\"/excel/ListExports?CourseID="+CourseID+"&ChapterID="+ChapterID+"&MinChapterID="+MinChapterID+"\" class=\"btn btn-primary\">导出名单</a>"
);
openCheckWin();
var html = "";
if (data == null || data == ""){
html = "<li class=\"list-group-item\" style=\"height: 50px;padding-right: 0;margin-left: 1.5%;width: 97%;\">"
+ "<span>暂无内容</span>"
+ "</li>";
}else{
// dataType:"json" already parses the response — no eval() needed.
// Build the whole list in one string instead of re-reading and
// re-parsing #checkView's HTML on every iteration, and show BOTH
// students plus the similarity (previously collected but unused).
$.each(data, function (index, item) {
html += "<li class=\"list-group-item\" style=\"height: 50px;padding-right: 0;margin-left: 1.5%;width: 97%;\">"
+ "<span>" + "学号:" + item.detectionUserID + " " + "姓名:" + item.detectionUserName
+ " 与 学号:" + item.measuredUserID + " 姓名:" + item.measuredUserName
+ " 相似度:" + item.similarity + "</span>"
+ "</li>";
});
}
$("#checkView").html(html);
}
});
}
function openCheckWin(){
// Reveal the floating recheck-result window.
var win = document.getElementById("CheckWin");
win.style.display = "block";
}
</script>
<div class="floatingWin" style="border-radius: 5px;margin-left: 28%;width: 40%;display: none;position: absolute;background: #FFFFFF;height: 450px;z-index: 2147483647" id="CheckWin">
<div id="listExport" style="width: 13%;float: left;margin-left: 1.5%">
</div>
<button type="button" style="width: 14%;outline: none;margin-top: 14px;margin-left: 1.5%;margin-bottom: 15px" onclick="closeCheckWin()" class="btn btn-primary">关闭</button>
<div class="form-group">
<span class="text-muted" style="margin-left: 1.5%">疑似抄袭名单</span>
<ul class="list-group" id="checkView" style="overflow: auto">
</ul>
</div>
</div>
3、controller
@ResponseBody
@RequestMapping("/ExerciseRecheck")
public List<ExerciseCheck> ExerciseRecheck(String CourseID,String ChapterID,String MinChapterID,HttpServletRequest request) throws Exception {
    // All submissions for the selected section; compared pairwise below.
    List<Exercise> exercises = exerciseService.QuerySectionExercise(CourseID, ChapterID, MinChapterID);
    List<ExerciseCheck> exerciseChecks = new ArrayList<ExerciseCheck>();
    if (exercises.size() < 2) {
        System.out.println("作业数小于2无法查重!");
        return exerciseChecks; // nothing to compare
    }
    // Read and normalize each Word document exactly once; the original
    // re-read every file from disk inside the O(n^2) comparison loop.
    List<String> texts = new ArrayList<String>(exercises.size());
    for (Exercise exercise : exercises) {
        texts.add(new WordRead().readWord(exercise.getChapterExercise(), request).replaceAll("\r|\n", ""));
    }
    // Same pair ordering as before: every ordered pair (i, j), i != j.
    for (int i = 0; i < exercises.size(); i++) {
        for (int j = 0; j < exercises.size(); j++) {
            if (i == j) {
                continue;
            }
            double similarity = CosineSimilarAlgorithm.cosSimilarityByString(texts.get(i), texts.get(j));
            if (similarity > 0.6) { // flag pairs above the plagiarism threshold
                ExerciseCheck ec = new ExerciseCheck();
                ec.setDetectionUserID(exercises.get(i).getUserID());
                ec.setDetectionUserName(exercises.get(i).getUserName());
                ec.setMeasuredUserID(exercises.get(j).getUserID());
                ec.setMeasuredUserName(exercises.get(j).getUserName());
                ec.setSimilarity(Double.toString(similarity));
                exerciseChecks.add(ec);
            }
        }
    }
    return exerciseChecks;
}
4、读取word文件内容
package com.graduation.util;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.List;
import java.util.UUID;
import javax.servlet.http.HttpServletRequest;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;
import com.sun.xml.internal.messaging.saaj.util.ByteInputStream;
public class WordRead {
    /**
     * Reads the plain text of an uploaded .doc exercise file.
     * <p>
     * Side effects kept from the original implementation: prints picture and
     * table diagnostics to stdout and copies embedded pictures into the
     * exercises directory via {@link #copyPic2Disk(List, File)}.
     *
     * @param filename file name WITHOUT the ".doc" extension
     * @param request  used to resolve the webapp's real path on disk
     * @return the document text as returned by {@code Range.text()}
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String readWord(String filename,HttpServletRequest request) throws Exception{
        String path = request.getServletContext().getRealPath("");
        // Uploaded exercises live under /static/exercises relative to the web root.
        // NOTE(review): the "\\" separator is Windows-specific — confirm deployment OS.
        String basePath = path + "\\static\\exercises\\";
        File file = new File(basePath + filename + ".doc");
        System.out.println(file.getPath());
        // try-with-resources: the original leaked the FileInputStream.
        try (FileInputStream in = new FileInputStream(file)) {
            HWPFDocument doc = new HWPFDocument(in);
            Range range = doc.getRange();
            String text = range.text();
            // Diagnostic dump of embedded pictures (position/size/type).
            List<Picture> pics = doc.getPicturesTable().getAllPictures();
            for (Picture pic : pics) {
                System.out.printf("开始位置%d\t图片大小度%d,高%d,\t图片类型%s\r\n",
                        pic.getStartOffset(), pic.getWidth(), pic.getHeight(), pic.getMimeType());
            }
            // Extract embedded pictures next to the documents.
            copyPic2Disk(pics, new File(basePath));
            // Diagnostic dump of every table cell in the document range.
            TableIterator tableIter = new TableIterator(range);
            while (tableIter.hasNext()) {
                Table table = tableIter.next();
                System.out.printf("开始位置%d,结束为止%d\r\n", table.getStartOffset(), table.getEndOffset());
                for (int j = 0; j < table.numRows(); j++) {
                    TableRow row = table.getRow(j);
                    for (int k = 0; k < row.numCells(); k++) {
                        TableCell cell = row.getCell(k);
                        System.out.println(cell.text().trim());
                    }
                }
            }
            return text;
        }
    }
    /**
     * Writes an image byte array to {@code path} under a random 6-char name
     * with a ".jpg" extension.
     * <p>
     * Uses the standard {@code ByteArrayInputStream} instead of the internal
     * {@code com.sun...ByteInputStream}, and closes both streams even on error.
     *
     * @param imgByte raw image bytes
     * @param path    directory prefix the file name is appended to
     * @throws Exception on any I/O failure
     */
    public static void copyByteToFile(byte[] imgByte,String path) throws Exception {
        String fileName = UUID.randomUUID().toString().substring(0, 6);
        try (InputStream in = new ByteArrayInputStream(imgByte);
             OutputStream out = new FileOutputStream(new File(path + fileName + ".jpg"))) {
            byte[] buff = new byte[1024];
            int len;
            while ((len = in.read(buff)) > 0) {
                out.write(buff, 0, len);
            }
            out.flush();
        }
    }
    /**
     * Writes every picture to disk using POI's own
     * {@code Picture.writeImageContent}, creating the target directory first.
     *
     * @param pics pictures extracted from the document; null/empty is a no-op
     * @param path target directory
     */
    public static void copyPic2Disk(List<Picture> pics,File path){
        if (pics == null || pics.size() <= 0) {
            return;
        }
        // Create the directory BEFORE validating it: the original checked
        // isDirectory() first, which made the mkdirs() branch unreachable
        // (a non-existent path is never a directory).
        if (!path.exists()) {
            path.mkdirs();
        }
        if (!path.isDirectory()) {
            throw new RuntimeException("路径填写不正确");
        }
        try {
            for (Picture pic : pics) {
                // Close each output stream; the original never did.
                try (FileOutputStream out = new FileOutputStream(new File(path, pic.suggestFullFileName()))) {
                    pic.writeImageContent(out);
                }
            }
        } catch (Exception e) {
            // Best-effort extraction: a bad picture should not abort readWord.
            e.printStackTrace();
        }
    }
}
5、CosineSimilarAlgorithm 获取两个文件相似性
package com.graduation.util;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class CosineSimilarAlgorithm {
    /**
     * Computes the cosine similarity of the word-count vectors of two files.
     *
     * @param firstFile  path understood by {@code TfIdfAlgorithm.wordSegCount}
     * @param secondFile path understood by {@code TfIdfAlgorithm.wordSegCount}
     * @return cosine similarity in [0, 1]; 0 on any error (best-effort,
     *         matching the original contract)
     */
    public static Double cosSimilarityByFile(String firstFile,String secondFile){
        try{
            Map<String, Map<String, Integer>> firstTfMap=TfIdfAlgorithm.wordSegCount(firstFile);
            Map<String, Map<String, Integer>> secondTfMap=TfIdfAlgorithm.wordSegCount(secondFile);
            if(firstTfMap==null || firstTfMap.size()==0){
                throw new IllegalArgumentException("firstFile not found or firstFile is empty! ");
            }
            if(secondTfMap==null || secondTfMap.size()==0){
                throw new IllegalArgumentException("secondFile not found or secondFile is empty! ");
            }
            Map<String,Integer> firstWords=firstTfMap.get(firstFile);
            Map<String,Integer> secondWords=secondTfMap.get(secondFile);
            // calculateCos matches terms by word, so no size-based swap is
            // needed — cosine similarity is symmetric.
            return calculateCos(firstWords, secondWords);
        }catch(Exception e){
            e.printStackTrace();
        }
        return 0d;
    }
    /**
     * Computes the cosine similarity of two strings after IK segmentation.
     *
     * @param first  first text
     * @param second second text
     * @return cosine similarity in [0, 1]; 0 on any error
     */
    public static Double cosSimilarityByString(String first,String second){
        try{
            Map<String, Integer> firstTfMap=TfIdfAlgorithm.segStr(first);
            Map<String, Integer> secondTfMap=TfIdfAlgorithm.segStr(second);
            return calculateCos(firstTfMap, secondTfMap);
        }catch(Exception e){
            e.printStackTrace();
        }
        return 0d;
    }
    /**
     * Cosine of the angle between two sparse term-frequency vectors.
     * <p>
     * BUG FIX: the original paired frequencies by insertion position in the
     * two maps, multiplying counts of unrelated words. The dot product must
     * pair frequencies of the SAME word, looked up by key.
     *
     * @return dot(first, second) / (|first| * |second|); 0 for a zero vector
     *         (avoids the original's NaN from 0/0)
     */
    private static Double calculateCos(Map<String, Integer> first,Map<String, Integer> second){
        double vectorProduct = 0.00;       // dot product over shared words
        double vectorFirstModulo = 0.00;   // squared norm of vector 1
        double vectorSecondModulo = 0.00;  // squared norm of vector 2
        for (Map.Entry<String, Integer> entry : first.entrySet()) {
            double v = entry.getValue().doubleValue();
            vectorFirstModulo += v * v;
            Integer other = second.get(entry.getKey());
            if (other != null) {
                vectorProduct += v * other.doubleValue();
            }
        }
        for (Integer value : second.values()) {
            double v = value.doubleValue();
            vectorSecondModulo += v * v;
        }
        if (vectorFirstModulo == 0 || vectorSecondModulo == 0) {
            return 0d;
        }
        return vectorProduct / (Math.sqrt(vectorFirstModulo) * Math.sqrt(vectorSecondModulo));
    }
    public static void main(String[] args){
        Double result=cosSimilarityByString("三网融合又可被称为“数位汇流”,是将电信网、计算机互联网和有线电视网三者互联互通,融合发展,从而为用户提供语音、数据和广播电视等服务, 伴随着通信行业加快发展,传统的三网融合已逐渐成为当前互联网发展的趋势。"
                ,"三网融合是指电信网、广播电视网、互联网在向宽带通信网、数字电视网、下一代互联网演进过程中,三大网络通过技术改造,其技术功能趋于一致,业务范围趋于相同,网络互联互通、资源共享,能为用户提供语音、数据和广播电视等多种服务。三合并不意味着三大网络的物理合一,而主要是指高层业务应用的融合。三网融合应用广泛,遍及智能交通、环境保护、政府工作、公共安全、平安家居等多个领域。以后的手机可以看电视、上网,电视可以打电话、上网,电脑也可以打电话、看电视。三者之间相互交叉,形成你中有我、我中有你的格局。");
        System.out.println(result);
    }
}
6、TfIdfAlgorithm 统计单词的TF-IDF
package com.graduation.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
public class TfIdfAlgorithm {
    /** Absolute paths of every file found by {@link #readDirs(String)}. */
    private static List<String> fileList = new ArrayList<String>();
    /** tf per file. key: file path, value: word -> tf. */
    private static Map<String, Map<String, Double>> allTfMap = new HashMap<String, Map<String, Double>>();
    /** Word counts per file. key: file path, value: word -> count. */
    private static Map<String, Map<String, Integer>> allSegsMap = new HashMap<String, Map<String, Integer>>();
    /** idf per word: log of document count over documents containing the word. */
    private static Map<String, Double> idfMap = new HashMap<String, Double>();
    /** key: word, value: number of documents containing that word. */
    private static Map<String, Integer> containWordOfAllDocNumberMap=new HashMap<String, Integer>();
    /** tf-idf per file. key: file path, value: word -> tf-idf. */
    private static Map<String, Map<String, Double>> tfIdfMap = new HashMap<String, Map<String, Double>>();
    /**
     * Recursively collects regular files under {@code filepath} into the
     * static {@code fileList} (callers such as {@link #allTf(String)} clear
     * the list first so repeated scans do not accumulate stale paths).
     *
     * @param filepath directory to scan; a non-directory just logs a warning
     * @return the shared {@code fileList}
     */
    private static List<String> readDirs(String filepath) throws FileNotFoundException, IOException {
        File file = new File(filepath);
        if (!file.isDirectory()) {
            System.out.println("输入的参数应该为[文件夹名]");
            System.out.println("filepath: " + file.getAbsolutePath());
            return fileList;
        }
        String[] children = file.list();
        if (children == null) { // I/O error or permission problem
            return fileList;
        }
        for (String child : children) {
            File readfile = new File(filepath + File.separator + child);
            if (readfile.isDirectory()) {
                readDirs(filepath + File.separator + child);
            } else {
                fileList.add(readfile.getAbsolutePath());
            }
        }
        return fileList;
    }
    /**
     * Reads a whole file as UTF-8 text, normalizing line ends to CRLF.
     *
     * @param file path of the file to read
     * @return file content as a single string
     */
    private static String readFile(String file) throws FileNotFoundException, IOException {
        StringBuilder sb = new StringBuilder();
        // try-with-resources: the original leaked the reader on exception.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\r\n");
            }
        }
        return sb.toString();
    }
    /**
     * Tokenizes {@code content} with IK and tallies counts into {@code words}.
     * Shared by {@link #segString(String)} and {@link #segStr(String)}, which
     * previously duplicated this loop verbatim.
     */
    private static Map<String, Integer> countTokens(String content, Map<String, Integer> words) {
        Reader input = new StringReader(content);
        // Second argument 'true' ENABLES IK's smart segmentation mode
        // (the original comment claimed the opposite).
        IKSegmenter iks = new IKSegmenter(input, true);
        try {
            Lexeme lexeme;
            while ((lexeme = iks.next()) != null) {
                String text = lexeme.getLexemeText();
                Integer count = words.get(text);
                words.put(text, count == null ? 1 : count + 1);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return words;
    }
    /** Word counts in an unordered HashMap. */
    private static Map<String, Integer> segString(String content){
        return countTokens(content, new HashMap<String, Integer>());
    }
    /** Word counts in a LinkedHashMap, preserving first-seen order. */
    public static Map<String, Integer> segStr(String content){
        return countTokens(content, new LinkedHashMap<String, Integer>());
    }
    /**
     * Returns up to {@code num} of the most frequent multi-character words,
     * in descending frequency order.
     *
     * @param num   maximum number of keywords to return
     * @param words word -> count map to rank
     */
    public static Map<String, Integer> getMostFrequentWords(int num,Map<String, Integer> words){
        Map<String, Integer> keywords = new LinkedHashMap<String, Integer>();
        List<Map.Entry<String, Integer>> info = new ArrayList<Map.Entry<String, Integer>>(words.entrySet());
        // Sort by count, descending.
        Collections.sort(info, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> obj1, Map.Entry<String, Integer> obj2) {
                return obj2.getValue() - obj1.getValue();
            }
        });
        int count = 0;
        for (Map.Entry<String, Integer> entry : info) {
            // Single-character tokens are skipped (usually stop-word noise).
            if (entry.getKey().length() > 1) {
                if (count >= num) {
                    break;
                }
                keywords.put(entry.getKey(), entry.getValue());
                count++;
            }
        }
        return keywords;
    }
    /**
     * Converts word counts to term frequency: tf(w,d) = count(w,d) / size(d),
     * where size(d) is the TOTAL number of tokens in the document.
     * <p>
     * BUG FIX: the original divided by the number of DISTINCT words,
     * contradicting the formula in its own documentation.
     *
     * @param segWordsResult word -> count for one document
     * @return word -> tf (empty map for null/empty input)
     */
    private static HashMap<String, Double> tf(Map<String, Integer> segWordsResult) {
        HashMap<String, Double> tf = new HashMap<String, Double>();
        if(segWordsResult==null || segWordsResult.size()==0){
            return tf;
        }
        double totalTokens = 0d;
        for (Integer count : segWordsResult.values()) {
            totalTokens += count;
        }
        for (Map.Entry<String, Integer> entry : segWordsResult.entrySet()) {
            tf.put(entry.getKey(), entry.getValue() / totalTokens);
        }
        return tf;
    }
    /**
     * Computes tf for every file under {@code dir}.
     * Clears the shared {@code fileList} first so repeated calls do not
     * reprocess paths left over from an earlier scan.
     *
     * @param dir directory to scan recursively
     * @return file path -> (word -> tf)
     */
    public static Map<String, Map<String, Double>> allTf(String dir){
        try{
            fileList.clear();
            fileList=readDirs(dir);
            for(String filePath : fileList){
                String content=readFile(filePath);
                Map<String, Integer> segs=segString(content);
                allSegsMap.put(filePath, segs);
                allTfMap.put(filePath, tf(segs));
            }
        }catch(FileNotFoundException ffe){
            ffe.printStackTrace();
        }catch(IOException io){
            io.printStackTrace();
        }
        return allTfMap;
    }
    /**
     * Computes word counts for every file under {@code dir}.
     * Clears the shared {@code fileList} first (see {@link #allTf(String)}).
     *
     * @param dir directory to scan recursively
     * @return file path -> (word -> count)
     */
    public static Map<String, Map<String, Integer>> wordSegCount(String dir){
        try{
            fileList.clear();
            fileList=readDirs(dir);
            for(String filePath : fileList){
                String content=readFile(filePath);
                Map<String, Integer> segs=segStr(content);
                allSegsMap.put(filePath, segs);
            }
        }catch(FileNotFoundException ffe){
            ffe.printStackTrace();
        }catch(IOException io){
            io.printStackTrace();
        }
        return allSegsMap;
    }
    /**
     * Counts, for each word, how many documents contain it.
     *
     * @param allSegsMap file path -> (word -> count)
     * @return word -> containing-document count (shared static map)
     */
    private static Map<String, Integer> containWordOfAllDocNumber(Map<String, Map<String, Integer>> allSegsMap){
        if(allSegsMap==null || allSegsMap.size()==0){
            return containWordOfAllDocNumberMap;
        }
        for (Map<String, Integer> fileSegs : allSegsMap.values()) {
            // Skip files whose segmentation produced nothing.
            if(fileSegs==null || fileSegs.size()==0){
                continue;
            }
            for (String seg : fileSegs.keySet()) {
                Integer current = containWordOfAllDocNumberMap.get(seg);
                containWordOfAllDocNumberMap.put(seg, current == null ? 1 : current + 1);
            }
        }
        return containWordOfAllDocNumberMap;
    }
    /**
     * idf(w) = log(n / (docs(w, D) + 1)), with n the number of documents.
     * <p>
     * BUG FIX: the original used the vocabulary size for n instead of the
     * document count, contradicting its own formula.
     *
     * @param allSegsMap file path -> (word -> count)
     * @return word -> idf
     */
    public static Map<String, Double> idf(Map<String, Map<String, Integer>> allSegsMap){
        if(allSegsMap==null || allSegsMap.size()==0){
            return idfMap;
        }
        containWordOfAllDocNumberMap=containWordOfAllDocNumber(allSegsMap);
        double docCount = allSegsMap.size();
        for (Map.Entry<String, Integer> entry : containWordOfAllDocNumberMap.entrySet()) {
            // +1 smooths words that appear in every document.
            idfMap.put(entry.getKey(), Math.log(docCount / (entry.getValue() + 1.0d)));
        }
        return idfMap;
    }
    /**
     * Combines tf and idf: tf-idf(w,d) = tf(w,d) * idf(w).
     *
     * @param allTfMap file path -> (word -> tf)
     * @param idf      word -> idf
     * @return file path -> (word -> tf-idf)
     */
    public static Map<String, Map<String, Double>> tfIdf(Map<String, Map<String, Double>> allTfMap,Map<String, Double> idf){
        for (Map.Entry<String, Map<String, Double>> fileEntry : allTfMap.entrySet()) {
            Map<String, Double> docTfIdf=new HashMap<String,Double>();
            for (Map.Entry<String, Double> tfEntry : fileEntry.getValue().entrySet()) {
                Double idfValue = idf.get(tfEntry.getKey());
                // Guard against words missing from the idf map (the original
                // would NPE here on a mismatched idf map).
                docTfIdf.put(tfEntry.getKey(), idfValue == null ? 0d : tfEntry.getValue() * idfValue);
            }
            tfIdfMap.put(fileEntry.getKey(), docTfIdf);
        }
        return tfIdfMap;
    }
    /** Demo driver: prints tf, idf and tf-idf for every file under d://dir. */
    public static void main(String[] args){
        System.out.println("tf--------------------------------------");
        Map<String, Map<String, Double>> allTfMap=TfIdfAlgorithm.allTf("d://dir");
        for (Map.Entry<String, Map<String, Double>> fileEntry : allTfMap.entrySet()) {
            for (Map.Entry<String, Double> wordEntry : fileEntry.getValue().entrySet()) {
                System.out.println("fileName:"+fileEntry.getKey()+" word:"+wordEntry.getKey()+" tf:"+wordEntry.getValue());
            }
        }
        System.out.println("idf--------------------------------------");
        Map<String, Double> idfMap=TfIdfAlgorithm.idf(allSegsMap);
        for (Map.Entry<String, Double> wordEntry : idfMap.entrySet()) {
            System.out.println("word:"+wordEntry.getKey()+" tf:"+wordEntry.getValue());
        }
        System.out.println("tf-idf--------------------------------------");
        Map<String, Map<String, Double>> tfIdfMap=TfIdfAlgorithm.tfIdf(allTfMap, idfMap);
        for (Map.Entry<String, Map<String, Double>> fileEntry : tfIdfMap.entrySet()) {
            for (Map.Entry<String, Double> wordEntry : fileEntry.getValue().entrySet()) {
                System.out.println("fileName:"+fileEntry.getKey()+" word:"+wordEntry.getKey()+" tf-idf:"+wordEntry.getValue());
            }
        }
    }
}