Java 实现:使用 IK 分词器计算 TF-IDF。
Doc 类,表示一篇文档。
package pojo;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
/**
 * A single document in the corpus: metadata (title, url, docno), the raw
 * content, and — after {@link #seg()} has run — the token list and per-term
 * occurrence counts produced by the IK analyzer.
 *
 * @version v0.1, 17/09/04
 */
public class Doc {

    /** True once seg() has successfully tokenized the content. */
    private boolean isSeg = false;

    /** Document title. */
    private String title;

    /** Document URL. */
    private String url;

    /** Raw document content (tag-stripped text). */
    private String content;

    /** Document number / identifier. */
    private String docno;

    /** Tokens in document order; populated by seg(). */
    private List<String> segs;

    /** Term -> occurrence count within this document; populated by seg(). */
    private Map<String, Integer> segMap;

    /** Creates an empty document. */
    public Doc() {}

    /**
     * Creates a document with its metadata and content.
     *
     * @param url     document URL
     * @param docno   document number / identifier
     * @param title   document title
     * @param content raw document content
     */
    public Doc(String url, String docno, String title, String content) {
        this.title = title;
        this.url = url;
        this.content = content;
        this.docno = docno;
    }

    /**
     * Tokenizes {@link #content} with the IK analyzer (smart mode) and fills
     * {@link #segs} (token list) and {@link #segMap} (term counts).
     * Idempotent: returns immediately if the document was already segmented.
     */
    public void seg() {
        if (isSeg()) { // already segmented — nothing to do
            System.out.println("已分词");
            return;
        }
        Reader reader = new StringReader(content);
        IKSegmenter ikSegmenter = new IKSegmenter(reader, true); // true = smart segmentation mode
        segs = new ArrayList<String>();
        segMap = new HashMap<String, Integer>();
        try {
            Lexeme lexeme;
            while ((lexeme = ikSegmenter.next()) != null) {
                String term = lexeme.getLexemeText();
                System.out.println(term);
                segs.add(term);
                Integer count = segMap.get(term); // single lookup instead of containsKey + get
                segMap.put(term, (count == null) ? 1 : count + 1);
            }
            isSeg = true; // mark complete only after a full, successful pass
        } catch (IOException e) {
            // StringReader does not do real I/O, but IKSegmenter.next() declares IOException.
            e.printStackTrace();
        }
    }

    @Override
    public String toString() {
        return "Doc{" +
               "isSeg=" + isSeg +
               ", title='" + title + '\'' +
               ", url='" + url + '\'' +
               ", content='" + content + '\'' +
               ", docno='" + docno + '\'' +
               ", segs=" + segs +
               ", segMap=" + segMap +
               '}';
    }

    /**
     * @return the raw document content
     */
    public String getContent() {
        return content;
    }

    /**
     * @return the document number / identifier
     */
    public String getDocno() {
        return docno;
    }

    /**
     * @return true if seg() has already tokenized this document
     */
    public boolean isSeg() {
        return isSeg;
    }

    /**
     * @return term -> occurrence count map, or null before seg() has run
     */
    public Map<String, Integer> getSegMap() {
        return segMap;
    }

    /**
     * @return token list in document order, or null before seg() has run
     */
    public List<String> getSegs() {
        return segs;
    }

    /**
     * @return the document title
     */
    public String getTitle() {
        return title;
    }

    /**
     * @return the document URL
     */
    public String getUrl() {
        return url;
    }
}
//~ Formatted by Jindent --- http://www.jindent.com
TfIdfUtil 类,计算 TF-IDF 的工具类。
package util;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import pojo.Doc;
/**
 * Computes TF-IDF over a document collection.
 * Reads a corpus file (Sogou Chinese news corpus format), extracts each
 * document's title/url/docno/content with regular expressions and strips the
 * tags, segments the content (see {@link pojo.Doc#seg()}), then computes
 * tf, idf and tf-idf, writing the results to both the console and a target
 * file.
 *
 * @version v0.1, 17/09/04
 * @author Kiwi Liu
 */
public class TfIdfUtil {

    // Patterns are compiled once here instead of being recompiled on every
    // loop iteration as before.
    /** One &lt;doc&gt;...&lt;/doc&gt; record (reluctant so adjacent records do not merge). */
    private static final Pattern DOC_PATTERN = Pattern.compile("<doc>.*?</doc>");
    /** Title field; reluctant quantifier fixes the original greedy ".*". */
    private static final Pattern TITLE_PATTERN = Pattern.compile("<contenttitle>.*?</contenttitle>");
    /** Url field. */
    private static final Pattern URL_PATTERN = Pattern.compile("<url>.*?</url>");
    /** Docno field. */
    private static final Pattern DOCNO_PATTERN = Pattern.compile("<docno>.*?</docno>");
    /** Content field. */
    private static final Pattern CONTENT_PATTERN = Pattern.compile("<content>.*?</content>");
    /** Any markup tag; used to strip tags from an extracted field. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /** Document set. */
    private static List<Doc> docList = new ArrayList<Doc>();

    /** Per-document term frequencies, keyed by docno. */
    private static Map<String, Map<String, Double>> docSetTfMap = new HashMap<String, Map<String, Double>>();

    /** Document frequency of each term (number of documents containing it). */
    private static Map<String, Integer> docSetDcMap = new HashMap<String, Integer>();

    /** Inverse document frequency of each term. */
    private static Map<String, Double> docSetIdfMap = new HashMap<String, Double>();

    /**
     * Extracts one tagged field from a document record and strips its tags.
     *
     * @param fieldPattern   pattern matching the whole field including its tags
     * @param doc            the &lt;doc&gt;...&lt;/doc&gt; record text
     * @param tagReplacement replacement for each stripped tag ("" or " ")
     * @return the field text with tags replaced, or null if the field is absent
     */
    private static String extractField(Pattern fieldPattern, String doc, String tagReplacement) {
        Matcher matcher = fieldPattern.matcher(doc);
        if (!matcher.find()) {
            return null;
        }
        return TAG_PATTERN.matcher(matcher.group()).replaceAll(tagReplacement);
    }

    /**
     * Builds the document set from the corpus file: extracts each document's
     * title, url, docno and content, segments the content, and adds the
     * document to {@link #docList}.
     *
     * @param corpusPath corpus file path
     */
    private static void createDocSet(String corpusPath) {
        StringBuffer corpusBuffer = new StringBuffer(); // whole corpus as one string
        try {
            // NOTE(review): FileReader uses the platform default charset; the
            // Sogou corpus is commonly GBK-encoded — confirm, and switch to an
            // explicit-charset InputStreamReader if text comes out garbled.
            BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(corpusPath)));
            try {
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    corpusBuffer.append(line);
                }
            } finally {
                bufferedReader.close(); // fix: the reader was previously never closed
            }
            Matcher matcherDoc = DOC_PATTERN.matcher(corpusBuffer.toString());
            while (matcherDoc.find()) { // one iteration per <doc> record
                String doc = matcherDoc.group();
                // The original code replaced title tags with " " but every
                // other field's tags with "" — preserved here.
                String title = extractField(TITLE_PATTERN, doc, " ");
                if (title != null) {
                    System.out.println("title: " + title);
                }
                String url = extractField(URL_PATTERN, doc, "");
                if (url != null) {
                    System.out.println("url: " + url);
                }
                String docno = extractField(DOCNO_PATTERN, doc, "");
                if (docno != null) {
                    System.out.println("docno: " + docno);
                }
                String content = extractField(CONTENT_PATTERN, doc, "");
                if (content != null) {
                    System.out.println("content: " + content);
                }
                Doc d = new Doc(url, docno, title, content);
                d.seg(); // tokenize and count terms
                docList.add(d);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes the inverse document frequency of every term in the document
     * set: idf = log(|docs| / df(term)).
     */
    private static void idf() {
        for (Doc doc : docList) { // count, per term, how many documents contain it
            for (String seg : doc.getSegMap().keySet()) {
                Integer count = docSetDcMap.get(seg);
                docSetDcMap.put(seg, (count == null) ? 1 : count + 1);
            }
        }
        // idf = log(size(doc set) / docs(d, w))
        for (String seg : docSetDcMap.keySet()) {
            docSetIdfMap.put(seg, Math.log((double) docList.size() / (double) docSetDcMap.get(seg)));
        }
    }

    /**
     * Computes the term frequency of every term in every document:
     * tf = count(w, d) / size(d).
     */
    private static void tf() {
        for (Doc doc : docList) {
            Map<String, Double> docTf = new HashMap<String, Double>(); // this document's term frequencies
            for (String seg : doc.getSegMap().keySet()) {
                docTf.put(seg, Double.valueOf((double) doc.getSegMap().get(seg) / (double) doc.getSegs().size()));
            }
            // NOTE(review): documents are keyed by docno; a null or duplicate
            // docno silently collides here — confirm docnos are unique.
            docSetTfMap.put(doc.getDocno(), docTf);
        }
    }

    /**
     * Writes per-document tf / idf / tf-idf results to the console and to the
     * target file.
     *
     * @param dstPath target file path
     */
    private static void print(String dstPath) {
        try {
            BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(new File(dstPath)));
            try { // fix: close the writer even when writing throws mid-loop
                for (Doc doc : docList) {
                    System.out.println(doc);
                    bufferedWriter.write(doc.toString());
                    bufferedWriter.newLine();
                    for (String seg : doc.getSegMap().keySet()) {
                        double tf = docSetTfMap.get(doc.getDocno()).get(seg);
                        double idf = docSetIdfMap.get(seg);
                        String segInfo = seg + " tf: " + tf + " idf: " + idf + " tfidf: " + tf * idf;
                        System.out.println(segInfo);
                        bufferedWriter.write(segInfo);
                        bufferedWriter.newLine();
                    }
                }
            } finally {
                bufferedWriter.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * End-to-end TF-IDF computation: build the document set, compute tf and
     * idf, then print the results.
     *
     * @param corpusPath corpus file path
     * @param dstPath    target file path
     */
    public static void tfidf(String corpusPath, String dstPath) {
        // Fix: reset static state so a repeated call does not accumulate
        // documents and counts left over from a previous run.
        docList.clear();
        docSetTfMap.clear();
        docSetDcMap.clear();
        docSetIdfMap.clear();
        createDocSet(corpusPath);
        tf();
        idf();
        print(dstPath);
    }
}
//~ Formatted by Jindent --- http://www.jindent.com