Plagiarism detection by comparing text similarity with cosine similarity

1. Add the tokenizer and Word-reading dependencies to pom.xml

<!-- IK Chinese tokenizer dependency -->
  <dependency>
      <groupId>com.janeluo</groupId>
      <artifactId>ikanalyzer</artifactId>
      <version>2012_u6</version>
  </dependency>
  <!-- Lucene dependency -->
  <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-highlighter</artifactId>
      <version>4.7.2</version>
  </dependency>
<!-- Word (.doc) file reading -->
  <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-scratchpad</artifactId>
      <version>3.14-beta1</version>
  </dependency>
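
To sanity-check the setup, here is a short, self-contained sketch (not from the original post) that tokenizes a Chinese string with IK, using exactly the classes the utilities in sections 5 and 6 rely on:

import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IkSmokeTest {
    public static void main(String[] args) throws Exception {
        // true enables IK's smart segmentation mode
        IKSegmenter iks = new IKSegmenter(new StringReader("余弦定理对比文本相似度"), true);
        Lexeme lexeme;
        while ((lexeme = iks.next()) != null) {
            System.out.println(lexeme.getLexemeText());
        }
    }
}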

2. JSP usage

<button type="button" style="width: 8%;outline: none;margin-left: 1.5%;margin-bottom: 15px" onclick="ExerciseCheck()" class="btn btn-primary">Check assignments</button>
<script type="text/javascript">
function ExerciseCheck() {
    var CourseID = $("#ds_course").val();
    var ChapterID = $("#ds_cnumber").val();
    var MinChapterID = $("#ds_snumber").val();
    $.ajax({
        type:"POST",
        url:"/exercise/ExerciseRecheck",
        dataType:"json",
        data:{
            CourseID:CourseID,
            ChapterID:ChapterID,
            MinChapterID:MinChapterID
        },
        async:false,
        success: function(data){
            $("#listExport").html($("#listExport").html()
                + "<a style=\"width: 100%;outline: none;margin-top: 14px;margin-left: 1.5%;margin-bottom: 15px\" href=\"/excel/ListExports?CourseID="+CourseID+"&ChapterID="+ChapterID+"&MinChapterID="+MinChapterID+"\" class=\"btn btn-primary\">导出名单</a>"
            );
            openCheckWin();
            document.getElementById('checkView').innerHTML = "";
            if (data == null || data == ""){
                $("#checkView").html($("#checkView").html()
                    + "<li class=\"list-group-item\" style=\"height: 50px;padding-right: 0;margin-left: 1.5%;width: 97%;\">"
                    + "<span>暂无内容</span>"
                    + "</li>"
                );
            }else{
                var json = eval(data);
                $.each(json, function (index) {
                    var DetectionUserID = json[index].detectionUserID;
                    var DetectionUserName = json[index].detectionUserName;
                    var MeasuredUserID = json[index].measuredUserID;
                    var MeasuredUserName = json[index].measuredUserName;
                    var Similarity = json[index].similarity;
                    $("#checkView").html($("#checkView").html()
                        + "<li class=\"list-group-item\" style=\"height: 50px;padding-right: 0;margin-left: 1.5%;width: 97%;\">"
                        + "<span>" +"学号:"+ DetectionUserID + "&nbsp;&nbsp;" +"姓名:"+ DetectionUserName + "</span>"
                        + "</li>"
                    );
                });
            }
        }
    });
}
    function openCheckWin(){
    document.getElementById("CheckWin").style.display = "block";
}
</script>
<div class="floatingWin" style="border-radius: 5px;margin-left: 28%;width: 40%;display: none;position: absolute;background: #FFFFFF;height: 450px;z-index: 111111111111111111111111111111" id="CheckWin">
    <div id="listExport" style="width: 13%;float: left;margin-left: 1.5%">

    </div>
    <button type="button" style="width: 14%;outline: none;margin-top: 14px;margin-left: 1.5%;margin-bottom: 15px" onclick="closeCheckWin()" class="btn btn-primary">Close</button>


    <div class="form-group">
        <span class="text-muted" style="margin-left: 1.5%">疑似抄袭名单</span>
        <ul class="list-group" id="checkView" style="overflow: auto">

        </ul>
    </div>
</div>
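
For reference, the /exercise/ExerciseRecheck endpoint implemented below returns a JSON array whose field names must match what the success callback reads. A made-up sample response (all values are illustrative):

[
  {"detectionUserID": "2016001", "detectionUserName": "Zhang San",
   "measuredUserID": "2016002", "measuredUserName": "Li Si",
   "similarity": "0.83"}
]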

3. Controller

@ResponseBody
    @RequestMapping("/ExerciseRecheck")
    public List<ExerciseCheck> ExerciseRecheck(String CourseID,String ChapterID,String MinChapterID,HttpServletRequest request) throws Exception {
        List<Exercise> list = exerciseService.QuerySectionExercise(CourseID,ChapterID,MinChapterID);
        List<ExerciseCheck> exerciseChecks = new ArrayList<ExerciseCheck>();
        if(list.size() < 2){
            System.out.println("Fewer than two submissions; nothing to check!");
        }else {
            // Compare each pair of submissions exactly once (the original looped over
            // all i != j, which reported every pair twice with the roles swapped)
            for(int i = 0;i < list.size();i++){
                String file1 = WordRead.readWord(list.get(i).getChapterExercise(),request).replaceAll("\r|\n", "");
                for (int j = i + 1;j < list.size();j++){
                    String file2 = WordRead.readWord(list.get(j).getChapterExercise(),request).replaceAll("\r|\n", "");
                    Double f = CosineSimilarAlgorithm.cosSimilarityByString(file1,file2);
                    // Flag pairs whose cosine similarity exceeds the 0.6 threshold
                    if(f > 0.6){
                        ExerciseCheck ec = new ExerciseCheck();
                        ec.setDetectionUserID(list.get(i).getUserID());
                        ec.setDetectionUserName(list.get(i).getUserName());
                        ec.setMeasuredUserID(list.get(j).getUserID());
                        ec.setMeasuredUserName(list.get(j).getUserName());
                        ec.setSimilarity(f.toString());
                        exerciseChecks.add(ec);
                    }
                }
            }
        }
        return exerciseChecks;
    }
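
The ExerciseCheck DTO is not shown in the original post; the minimal sketch below is inferred from the setters used above and the JSON fields the page reads (the package name and field types are assumptions):

package com.graduation.entity; // hypothetical package

public class ExerciseCheck {
    private String detectionUserID;   // student whose submission triggered the match
    private String detectionUserName;
    private String measuredUserID;    // student whose submission it was compared against
    private String measuredUserName;
    private String similarity;        // cosine similarity, serialized as a string

    public String getDetectionUserID() { return detectionUserID; }
    public void setDetectionUserID(String detectionUserID) { this.detectionUserID = detectionUserID; }
    public String getDetectionUserName() { return detectionUserName; }
    public void setDetectionUserName(String detectionUserName) { this.detectionUserName = detectionUserName; }
    public String getMeasuredUserID() { return measuredUserID; }
    public void setMeasuredUserID(String measuredUserID) { this.measuredUserID = measuredUserID; }
    public String getMeasuredUserName() { return measuredUserName; }
    public void setMeasuredUserName(String measuredUserName) { this.measuredUserName = measuredUserName; }
    public String getSimilarity() { return similarity; }
    public void setSimilarity(String similarity) { this.similarity = similarity; }
}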

4. Reading the content of a Word file

	package com.graduation.util;
	
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileOutputStream;
	import java.io.InputStream;
	import java.io.OutputStream;
	import java.util.List;
	import java.util.UUID;
	
	import org.apache.poi.hwpf.HWPFDocument;
	import org.apache.poi.hwpf.usermodel.Picture;
	import org.apache.poi.hwpf.usermodel.Range;
	import org.apache.poi.hwpf.usermodel.Table;
	import org.apache.poi.hwpf.usermodel.TableCell;
	import org.apache.poi.hwpf.usermodel.TableIterator;
	import org.apache.poi.hwpf.usermodel.TableRow;
	
	import java.io.ByteArrayInputStream;
	
	import javax.servlet.http.HttpServletRequest;
	
	public class WordRead {
	
	    public static String readWord(String filename,HttpServletRequest request) throws Exception{
	        String path = request.getServletContext().getRealPath("");
	        System.out.println(path);
	        String FilePath=path + "\\static\\exercises\\";// fetch from our upload folder
	        String BASE_PATH = FilePath;
	        filename = filename+".doc";
	        File file = new File(BASE_PATH + filename);
	        System.out.println(BASE_PATH + filename);
	        HWPFDocument doc = new HWPFDocument(new FileInputStream(file));
	
	        // Get the text directly from the HWPFDocument
	        StringBuilder sb = doc.getText();
	//        System.out.println("Text: " + sb.toString());
	
	        // Get the text via a Range object
	        Range range = doc.getRange();
	        String text = range.text();
	//        System.out.println(text);
	
	        // Count the paragraphs; in Word, every carriage return ends a paragraph
	        int nums = range.numParagraphs();
	//        System.out.println("Number of paragraphs: " + nums);
	
	        // Get all pictures embedded in the doc
	        List<Picture> pics = doc.getPicturesTable().getAllPictures();
	
	        for(Picture pic:pics){
	            // Position of the picture inside the doc file; needed when converting the doc to other formats
	            int start = pic.getStartOffset();
	            int width = pic.getWidth();
	            int height = pic.getHeight();
	            String mimeType = pic.getMimeType();
	
	            System.out.printf("offset %d\twidth %d, height %d,\tMIME type %s\r\n",start,width,height,mimeType);
	        }
	        // 1. Write the file via Picture's writeImageContent method
	        // 2. Or get the Picture's bytes and write them out yourself
	        copyPic2Disk(pics, new File(BASE_PATH));
	
	
	        // Iterate over the tables within the range
	        TableIterator tableIter = new TableIterator(range);
	        while (tableIter.hasNext()) {
	            Table table = tableIter.next();
	            // Start offset
	            int start = table.getStartOffset();
	            // End offset
	            int end = table.getEndOffset();
	            System.out.printf("start offset %d, end offset %d\r\n",start,end);
	
	            // Number of rows in the table
	            int rowNum = table.numRows();
	            for (int j = 0; j < rowNum; j++) {
	                // Each row
	                TableRow row = table.getRow(j);
	                int cellNum = row.numCells();
	                for (int k = 0; k < cellNum; k++) {
	                    // Each cell
	                    TableCell cell = row.getCell(k);
	                    // Print the cell's text
	                    System.out.println(cell.text().trim());
	                }
	            }
	        }
	        return text;
	    }
	
	    /**
	     * You can also write the bytes out yourself
	     * @param imgByte
	     * @throws Exception
	     */
	    public static void copyByteToFile(byte[] imgByte,String path) throws Exception {
	
	        InputStream in = new ByteArrayInputStream(imgByte, 0, imgByte.length);
	        byte[] buff = new byte[1024];
	        String fileName = UUID.randomUUID().toString().substring(0, 6);
	        OutputStream out = new FileOutputStream(new File(path + fileName + ".jpg"));
	
	        int len = 0;
	        while ((len = in.read(buff)) > 0) {
	            out.write(buff, 0, len);
	        }
	
	        out.flush();
	        out.close();
	        in.close();
	    }
	
	    /**
	     * Copy the pictures to disk using Picture's own write method
	     * @param pics
	     * @param path
	     */
	    public static void copyPic2Disk(List<Picture> pics,File path){
	        if(pics == null  || pics.size()  <=0){
	            return;
	        }
	        // Create the directory first if it does not exist yet (the original threw
	        // on the isDirectory() check before mkdirs() could ever run)
	        if(!path.exists() ){
	            path.mkdirs();
	        }
	        if(!path.isDirectory()){
	            throw new RuntimeException("Invalid target directory");
	        }
	
	        try {
	            for(Picture pic:pics){
	                // Write the image out with the method Picture itself provides
	                pic.writeImageContent(new FileOutputStream(new File(path,pic.suggestFullFileName())));
	                /*byte [] picBytes = pic.getContent(); // or grab the raw bytes and write them yourself
	                copyByteToFile(picBytes, path.getPath() + File.separator);*/
	            }
	        } catch (Exception e) {
	            e.printStackTrace();
	        }
	    }
	}
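
If you only need the plain text (which is all the plagiarism check uses), a shorter route is WordExtractor, which ships in the same poi-scratchpad artifact. A minimal sketch, with a hypothetical file path:

import java.io.FileInputStream;
import org.apache.poi.hwpf.extractor.WordExtractor;

public class WordTextDemo {
    public static void main(String[] args) throws Exception {
        // WordExtractor handles the same binary .doc format as HWPFDocument above
        try (FileInputStream in = new FileInputStream("D:/exercises/demo.doc")) {
            WordExtractor extractor = new WordExtractor(in);
            System.out.println(extractor.getText()); // all paragraphs joined with newlines
        }
    }
}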

5. CosineSimilarAlgorithm: computing the similarity of two files
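
Cosine similarity treats each text as a term-frequency vector and measures the angle between the two vectors: cos θ = (A · B) / (|A| × |B|), ranging from 0 (no terms in common) to 1 (identical term distributions). For example, for A = (2, 1, 0) and B = (1, 0, 1): A · B = 2, |A| = √5, |B| = √2, so the similarity is 2 / √10 ≈ 0.63.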

	package com.graduation.util;
	import java.util.ArrayList;
	import java.util.LinkedHashMap;
	import java.util.List;
	import java.util.Map;
	import java.util.Set;
	public class CosineSimilarAlgorithm {
	
	    /**
	     *
	     * @Title: cosSimilarityByFile
	     * @Description: compute the similarity of two files
	     * @param firstFile
	     * @param secondFile
	     * @return Double
	     * @throws
	     */
	    public static Double cosSimilarityByFile(String firstFile,String secondFile){
	        try{
	            Map<String, Map<String, Integer>> firstTfMap=TfIdfAlgorithm.wordSegCount(firstFile);
	            Map<String, Map<String, Integer>> secondTfMap=TfIdfAlgorithm.wordSegCount(secondFile);
	            if(firstTfMap==null || firstTfMap.size()==0){
	                throw new IllegalArgumentException("firstFile not found or firstFile is empty! ");
	            }
	            if(secondTfMap==null || secondTfMap.size()==0){
	                throw new IllegalArgumentException("secondFile not found or secondFile is empty! ");
	            }
	            Map<String,Integer> firstWords=firstTfMap.get(firstFile);
	            Map<String,Integer> secondWords=secondTfMap.get(secondFile);
	            if(firstWords.size()<secondWords.size()){
	                Map<String, Integer> temp=firstWords;
	                firstWords=secondWords;
	                secondWords=temp;
	            }
	            return calculateCos((LinkedHashMap<String, Integer>)firstWords, (LinkedHashMap<String, Integer>)secondWords);
	
	        }catch(Exception e){
	            e.printStackTrace();
	        }
	        return 0d;
	    }
	
	    /**
	     *
	     * @Title: cosSimilarityByString
	     * @Description: compute the similarity of two strings
	     * @param first
	     * @param second
	     * @return Double
	     * @throws
	     */
	    public static Double cosSimilarityByString(String first,String second){
	        try{
	            Map<String, Integer> firstTfMap=TfIdfAlgorithm.segStr(first);
	            Map<String, Integer> secondTfMap=TfIdfAlgorithm.segStr(second);
	            if(firstTfMap.size()<secondTfMap.size()){
	                Map<String, Integer> temp=firstTfMap;
	                firstTfMap=secondTfMap;
	                secondTfMap=temp;
	            }
	
	            return calculateCos((LinkedHashMap<String, Integer>)firstTfMap, (LinkedHashMap<String, Integer>)secondTfMap);
	
	        }catch(Exception e){
	            e.printStackTrace();
	        }
	        return 0d;
	    }
	
	    /**
	     *
	     * @Title: calculateCos
	     * @Description: compute the cosine similarity of two term-frequency vectors
	     * @param first
	     * @param second
	     * @return Double
	     * @throws
	     */
	    private static Double calculateCos(LinkedHashMap<String, Integer> first,LinkedHashMap<String, Integer> second){
	
	        double vectorFirstModulo = 0.00;// modulus of vector 1
	        double vectorSecondModulo = 0.00;// modulus of vector 2
	        double vectorProduct = 0.00; // dot product
	        // The dot product must pair the frequencies of the SAME word, so look each
	        // word of the first vector up in the second map (the original paired entries
	        // by insertion position, multiplying the counts of unrelated words)
	        for(Map.Entry<String, Integer> entry : first.entrySet()){
	            double v = entry.getValue().doubleValue();
	            vectorFirstModulo += v * v;
	            Integer other = second.get(entry.getKey());
	            if(other != null){
	                vectorProduct += v * other.doubleValue();
	            }
	        }
	        for(Integer value : second.values()){
	            vectorSecondModulo += value.doubleValue() * value.doubleValue();
	        }
	        return vectorProduct/(Math.sqrt(vectorFirstModulo)*Math.sqrt(vectorSecondModulo));
	    }
	
	    public static void main(String[] args){
	        Double result=cosSimilarityByString("三网融合又可被称为“数位汇流”,是将电信网、计算机互联网和有线电视网三者互联互通,融合发展,从而为用户提供语音、数据和广播电视等服务, 伴随着通信行业加快发展,传统的三网融合已逐渐成为当前互联网发展的趋势。"
	                ,"三网融合是指电信网、广播电视网、互联网在向宽带通信网、数字电视网、下一代互联网演进过程中,三大网络通过技术改造,其技术功能趋于一致,业务范围趋于相同,网络互联互通、资源共享,能为用户提供语音、数据和广播电视等多种服务。三合并不意味着三大网络的物理合一,而主要是指高层业务应用的融合。三网融合应用广泛,遍及智能交通、环境保护、政府工作、公共安全、平安家居等多个领域。以后的手机可以看电视、上网,电视可以打电话、上网,电脑也可以打电话、看电视。三者之间相互交叉,形成你中有我、我中有你的格局。");
	        System.out.println(result);
	    }
	}

6. TfIdfAlgorithm: computing TF-IDF statistics
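
As a quick reference for the code below: tf(w, d) = count(w, d) / size(d) is a word's relative frequency within one document, idf(w) = log(n / (docs(w, D) + 1)) discounts words that appear in many of the n documents, and tf-idf is their product. For example, a word occurring 3 times in a 100-word document and appearing in 2 of 10 documents scores (3/100) × log(10/3) ≈ 0.036.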

package com.graduation.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class TfIdfAlgorithm {
    /**
     * File paths collected by readDirs
     */
    private static List<String> fileList = new ArrayList<String>();
    /**
     * TF results for every file. key: file path, value: that file's tf map
     */
    private static Map<String, Map<String, Double>> allTfMap = new HashMap<String, Map<String, Double>>();

    /**
     * Word-count (segmentation) results for every file. key: file path, value: that file's word counts
     */
    private static Map<String, Map<String, Integer>> allSegsMap = new HashMap<String, Map<String, Integer>>();

    /**
     * IDF (Inverse Document Frequency) of every word across the document set: the log of the
     * ratio between the total number of documents n and the number of documents docs(w, D)
     * containing word w. key: word, value: its idf
     */
    private static Map<String, Double> idfMap = new HashMap<String, Double>();

    /**
     * Number of documents containing each word. key: word, value: number of documents containing it
     */
    private static Map<String, Integer> containWordOfAllDocNumberMap=new HashMap<String, Integer>();

    /**
     * TF-IDF of every word.
     * key: file path, value: that file's tf-idf map
     */
    private static Map<String, Map<String, Double>> tfIdfMap = new HashMap<String, Map<String, Double>>();


    /**
     *
     * @Title: readDirs
     * @Description: collect files recursively
     * @param filepath
     * @return List<String>
     * @throws FileNotFoundException
     * @throws IOException
     */
    private static List<String> readDirs(String filepath) throws FileNotFoundException, IOException {
        try {
            File file = new File(filepath);
            if (!file.isDirectory()) {
                System.out.println("输入的参数应该为[文件夹名]");
                System.out.println("filepath: " + file.getAbsolutePath());
            } else if (file.isDirectory()) {
                String[] filelist = file.list();
                for (int i = 0; i < filelist.length; i++) {
                    File readfile = new File(filepath + File.separator + filelist[i]);
                    if (!readfile.isDirectory()) {
                        fileList.add(readfile.getAbsolutePath());
                    } else if (readfile.isDirectory()) {
                        readDirs(filepath + File.separator + filelist[i]);
                    }
                }
            }

        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        return fileList;
    }

    /**
     *
     * @Title: readFile
     * @Description: read a file into a string
     * @param file
     * @return String
     * @throws FileNotFoundException
     * @throws IOException
     */
    private static String readFile(String file) throws FileNotFoundException, IOException {
        StringBuffer sb = new StringBuffer();
        InputStreamReader is = new InputStreamReader(new FileInputStream(file), "UTF-8");
        BufferedReader br = new BufferedReader(is);
        String line = br.readLine();
        while (line != null) {
            sb.append(line).append("\r\n");
            line = br.readLine();
        }
        br.close();
        return sb.toString();
    }


    /**
     *
     * @Title: segString
     * @Description: tokenize a string with IK and count each word's occurrences
     * @param content
     * @return Map<String,Integer>
     * @throws
     */
    private static Map<String, Integer> segString(String content){
        // Tokenize
        Reader input = new StringReader(content);
        // The second argument turns IK's smart segmentation mode on (this strongly affects precision)
        IKSegmenter iks = new IKSegmenter(input, true);
        Lexeme lexeme = null;
        Map<String, Integer> words = new HashMap<String, Integer>();
        try {
            while ((lexeme = iks.next()) != null) {
                if (words.containsKey(lexeme.getLexemeText())) {
                    words.put(lexeme.getLexemeText(), words.get(lexeme.getLexemeText()) + 1);
                } else {
                    words.put(lexeme.getLexemeText(), 1);
                }
            }
        }catch(IOException e) {
            e.printStackTrace();
        }
        return words;
    }

    /**
     *
     * @Title: segStr
     * @Description: tokenize, returning the counts in an insertion-ordered LinkedHashMap
     * @param content
     * @return Map<String,Integer>
     * @throws
     */
    public static Map<String, Integer> segStr(String content){
        // Tokenize
        Reader input = new StringReader(content);
        // The second argument turns IK's smart segmentation mode on (this strongly affects precision)
        IKSegmenter iks = new IKSegmenter(input, true);
        Lexeme lexeme = null;
        Map<String, Integer> words = new LinkedHashMap<String, Integer>();
        try {
            while ((lexeme = iks.next()) != null) {
                if (words.containsKey(lexeme.getLexemeText())) {
                    words.put(lexeme.getLexemeText(), words.get(lexeme.getLexemeText()) + 1);
                } else {
                    words.put(lexeme.getLexemeText(), 1);
                }
            }
        }catch(IOException e) {
            e.printStackTrace();
        }
        return words;
    }

    public static Map<String, Integer> getMostFrequentWords(int num,Map<String, Integer> words){

        Map<String, Integer> keywords = new LinkedHashMap<String, Integer>();
        int count=0;
        // Rank the words by frequency, descending
        List<Map.Entry<String, Integer>> info = new ArrayList<Map.Entry<String, Integer>>(words.entrySet());
        Collections.sort(info, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> obj1, Map.Entry<String, Integer> obj2) {
                return obj2.getValue() - obj1.getValue();
            }
        });

        // Emit the highest-frequency words
        for (int j = 0; j < info.size(); j++) {
            // word --> frequency
            if(info.get(j).getKey().length()>1){ // keep only tokens longer than one character
                if(num>count){
                    keywords.put(info.get(j).getKey(), info.get(j).getValue());
                    count++;
                }else{
                    break;
                }
            }
        }
        return keywords;
    }

    /**
     *
     * @Title: tf
     * @Description: convert word counts into tf values: tf(w, d) = count(w, d) / size(d),
     * i.e. the number of occurrences of word w in document d over the total number of words in d
     * @param segWordsResult
     * @return HashMap<String,Double>
     * @throws
     */
    private static HashMap<String, Double> tf(Map<String, Integer> segWordsResult) {

        HashMap<String, Double> tf = new HashMap<String, Double>();// normalized frequencies
        if(segWordsResult==null || segWordsResult.size()==0){
            return tf;
        }
        // size(d) is the total word count, i.e. the sum of all counts
        // (the original divided by the number of distinct words, contradicting the formula above)
        double size=0d;
        for(Integer count : segWordsResult.values()){
            size += count;
        }
        Set<String> keys=segWordsResult.keySet();
        for(String key: keys){
            Integer value=segWordsResult.get(key);
            tf.put(key, Double.valueOf(value)/size);
        }
        return tf;
    }

    /**
     *
     * @Title: allTf
     * @Description: compute the tf of every file under a directory
     * @param dir
     * @return Map<String,Map<String,Double>>
     * @throws
     */
    public static Map<String, Map<String, Double>> allTf(String dir){
        try{
            fileList=readDirs(dir);
            for(String filePath : fileList){
                String content=readFile(filePath);
                Map<String, Integer> segs=segString(content);
                allSegsMap.put(filePath, segs);
                allTfMap.put(filePath, tf(segs));
            }
        }catch(FileNotFoundException ffe){
            ffe.printStackTrace();
        }catch(IOException io){
            io.printStackTrace();
        }
        return allTfMap;
    }

    /**
     *
     * @Title: wordSegCount
     * @Description: return every file's word counts, each stored as a LinkedHashMap
     * @param dir
     * @return Map<String,Map<String,Integer>>
     * @throws
     */
    public static Map<String, Map<String, Integer>> wordSegCount(String dir){
        try{
            fileList=readDirs(dir);
            for(String filePath : fileList){
                String content=readFile(filePath);
                Map<String, Integer> segs=segStr(content);
                allSegsMap.put(filePath, segs);
            }
        }catch(FileNotFoundException ffe){
            ffe.printStackTrace();
        }catch(IOException io){
            io.printStackTrace();
        }
        return allSegsMap;
    }


    /**
     *
     * @Title: containWordOfAllDocNumber
     * @Description: count, for each word, how many documents contain it. key: word, value: document count
     * @param allSegsMap
     * @return Map<String,Integer>
     * @throws
     */
    private static Map<String, Integer> containWordOfAllDocNumber(Map<String, Map<String, Integer>> allSegsMap){
        if(allSegsMap==null || allSegsMap.size()==0){
            return containWordOfAllDocNumberMap;
        }

        Set<String> fileList=allSegsMap.keySet();
        for(String filePath: fileList){
            Map<String, Integer> fileSegs=allSegsMap.get(filePath);
            // Skip files whose token map is null or empty
            if(fileSegs==null || fileSegs.size()==0){
                continue;
            }
            // Record each word's document frequency
            Set<String> segs=fileSegs.keySet();
            for(String seg : segs){
                if (containWordOfAllDocNumberMap.containsKey(seg)) {
                    containWordOfAllDocNumberMap.put(seg, containWordOfAllDocNumberMap.get(seg) + 1);
                } else {
                    containWordOfAllDocNumberMap.put(seg, 1);
                }
            }

        }
        return containWordOfAllDocNumberMap;
    }

    /**
     *
     * @Title: idf
     * @Description: idf = log(n / (docs(w, D) + 1)), where n is the total number of documents
     * @param allSegsMap
     * @return Map<String,Double>
     * @throws
     */
    public static Map<String, Double> idf(Map<String, Map<String, Integer>> allSegsMap){
        if(allSegsMap==null || allSegsMap.size()==0){
            return idfMap;
        }
        containWordOfAllDocNumberMap=containWordOfAllDocNumber(allSegsMap);
        Set<String> words=containWordOfAllDocNumberMap.keySet();
        // n must be the number of documents, not the vocabulary size (which the original used)
        Double docSize=Double.valueOf(allSegsMap.size());
        for(String word: words){
            Double number=Double.valueOf(containWordOfAllDocNumberMap.get(word));
            idfMap.put(word, Math.log(docSize/(number+1.0d)));
        }
        return idfMap;
    }

    /**
     *
     * @Title: tfIdf
     * @Description: tf-idf = tf × idf
     * @param allTfMap
     * @param idf
     * @return Map<String,Map<String,Double>>
     * @throws
     */
    public static Map<String, Map<String, Double>> tfIdf(Map<String, Map<String, Double>> allTfMap,Map<String, Double> idf){

        Set<String> fileList=allTfMap.keySet();
        for(String filePath : fileList){
            Map<String, Double> tfMap=allTfMap.get(filePath);
            Map<String, Double> docTfIdf=new HashMap<String,Double>();
            Set<String> words=tfMap.keySet();
            for(String word: words){
                Double tfValue=Double.valueOf(tfMap.get(word));
                Double idfValue=idf.get(word);
                docTfIdf.put(word, tfValue*idfValue);
            }
            tfIdfMap.put(filePath, docTfIdf);
        }
        return tfIdfMap;
    }


    public static void main(String[] args){

        System.out.println("tf--------------------------------------");
        Map<String, Map<String, Double>> allTfMap=TfIdfAlgorithm.allTf("d://dir");
        Set<String> fileList=allTfMap.keySet();
        for(String filePath : fileList){
            Map<String, Double> tfMap=allTfMap.get(filePath);
            Set<String> words=tfMap.keySet();
            for(String word: words){
                System.out.println("fileName:"+filePath+"     word:"+word+"      tf:"+tfMap.get(word));
            }
        }

        System.out.println("idf--------------------------------------");
        Map<String, Double> idfMap=TfIdfAlgorithm.idf(allSegsMap);
        Set<String> words=idfMap.keySet();
        for(String word : words){
            System.out.println("word:"+word+"     tf:"+idfMap.get(word));
        }

        System.out.println("tf-idf--------------------------------------");
        Map<String, Map<String, Double>> tfIdfMap=TfIdfAlgorithm.tfIdf(allTfMap, idfMap);
        Set<String> files=tfIdfMap.keySet();
        for(String filePath : files){
            Map<String, Double> tfIdf=tfIdfMap.get(filePath);
            Set<String> segs=tfIdf.keySet();
            for(String word: segs){
                System.out.println("fileName:"+filePath+"     word:"+word+"        tf-idf:"+tfIdf.get(word));
            }
        }
    }
}