Extracting Keywords from an Article with TF-IDF in Java, Using Jieba Segmentation

Unfortunately, the newest jieba-analysis version published to Maven is 1.0.2, while keyword extraction is only supported starting with 1.0.3.

The jieba-analysis source repository is here:

https://github.com/huaban/jieba-analysis

There are a couple of ways to use jieba segmentation in our own project:

(1) Download the latest jieba-analysis source, build it, push the jar to your own private Maven repository, and depend on it from there (a rough sketch follows this list).

(2) Depend on the older jar from Maven and manually port the newer version's keyword-extraction functionality into your own project.

I went with approach (2).
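
For reference, approach (1) would look roughly like the sketch below. This is only an outline: the jar file name depends on the version in the checked-out pom, and the repository URL and repositoryId are placeholders for your own Nexus/Artifactory setup.

git clone https://github.com/huaban/jieba-analysis.git
cd jieba-analysis
mvn clean package -DskipTests

# deploy the built jar to a private Maven repository
# (URL and repositoryId below are placeholders, substitute your own)
mvn deploy:deploy-file \
    -Dfile=target/jieba-analysis-1.0.3.jar \
    -DgroupId=com.huaban \
    -DartifactId=jieba-analysis \
    -Dversion=1.0.3 \
    -Dpackaging=jar \
    -Durl=http://your-nexus-host/repository/maven-releases/ \
    -DrepositoryId=your-repo-id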

1. Add the dependency

<!-- https://mvnrepository.com/artifact/com.huaban/jieba-analysis -->
		<dependency>
			<groupId>com.huaban</groupId>
			<artifactId>jieba-analysis</artifactId>
			<version>1.0.2</version>
		</dependency>

Add the dictionary files (they can be taken from the resources directory of the Git repository above).
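
Based on the paths the analyzer class below loads from, the two files are expected under src/main/resources/dic/ (the dic/ subdirectory is this project's own layout choice, not something jieba-analysis requires):

src/main/resources/
└── dic/
    ├── stop_words.txt   (stop-word list, one word per line)
    └── idf_dict.txt     (idf dictionary, one "word idf_value" pair per line)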

Add the entity class

public class Keyword implements Comparable<Keyword> {
    /**
     * the tf-idf score of this keyword
     */
    private double tfidfvalue;
    /**
     * the keyword text
     */
    private String name;

    public Keyword(String name, double tfidfvalue) {
        this.name = name;
        // round the tfidf value to 4 decimal places
        this.tfidfvalue = (double) Math.round(tfidfvalue * 10000) / 10000;
    }

    public double getTfidfvalue() {
        return tfidfvalue;
    }

    public void setTfidfvalue(double tfidfvalue) {
        this.tfidfvalue = tfidfvalue;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * Implement Comparable so that tfidf analysis results can be returned
     * in descending order of score.
     */
    @Override
    public int compareTo(Keyword o) {
        // descending order: the higher tfidf score sorts first
        return Double.compare(o.tfidfvalue, this.tfidfvalue);
    }

    /**
     * hashCode is based on the name field only, keeping it consistent with
     * equals(), which also compares only the name.
     */
    @Override
    public int hashCode() {
        final int PRIME = 31;
        int result = 1;
        result = PRIME * result + ((name == null) ? 0 : name.hashCode());
        return result;
    }

    /**
     * Two keywords are considered equal when their names are equal;
     * the tfidf score is deliberately ignored.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        Keyword other = (Keyword) obj;
        if (name == null) {
            if (other.name != null) {
                return false;
            }
        } else if (!name.equals(other.name)) {
            return false;
        }
        return true;
    }

}
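
A quick throwaway check of the ordering and equality semantics defined above (not part of the project code; the sample words and scores are made up):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class KeywordDemo {
    public static void main(String[] args) {
        List<Keyword> list = new ArrayList<>();
        list.add(new Keyword("database", 0.12));
        list.add(new Keyword("index", 0.47));
        list.add(new Keyword("query", 0.30));

        // compareTo sorts by tfidf value in descending order
        Collections.sort(list);
        for (Keyword k : list) {
            System.out.println(k.getName() + " -> " + k.getTfidfvalue());
        }
        // prints: index -> 0.47, query -> 0.3, database -> 0.12

        // equals() compares the name only, so the score is ignored
        System.out.println(new Keyword("index", 0.1).equals(new Keyword("index", 0.9))); // true
    }
}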

Add the analyzer class

@Slf4j
public class TfIdfAnalyzer {
    /**
     * idf dictionary: word -> idf value
     */
    static HashMap<String, Double> idfMap;
    /**
     * stop-word set
     */
    static HashSet<String> stopWordsSet;
    /**
     * median idf value, used as a fallback for words missing from the idf dictionary
     */
    static double idfMedian;

    /**
     * tfidf analysis.
     *
     * @param content the text/document to analyze
     * @param top the number of highest-tfidf keywords to return; if top exceeds
     *            the number of distinct words in content, all of them are returned
     * @return the keyword list
     * @author lizongke
     * @date 2020/1/13 9:58
     */
    public List<Keyword> analyze(String content, int top) {
        List<Keyword> keywordList = new ArrayList<>();
        try {
            if (stopWordsSet == null) {
                stopWordsSet = new HashSet<>();
                ClassPathResource classPathResource = new ClassPathResource("dic/stop_words.txt");
                InputStream inputStream = classPathResource.getInputStream();
                loadStopWords(stopWordsSet, inputStream);
            }
            if (idfMap == null) {
                idfMap = new HashMap<>();
                loadIdfMap(idfMap, new ClassPathResource("dic/idf_dict.txt").getInputStream());
            }
        } catch (Exception e) {
            log.error("failed to load the stop-word and idf dictionaries", e);
        }
        Map<String, Double> tfMap = getTf(content);
        for (String word : tfMap.keySet()) {
            // if the word is not in the idf dictionary, use the median idf value instead
            // (new internet slang may need to be added to the dictionary periodically)
            if (idfMap.containsKey(word)) {
                keywordList.add(new Keyword(word, idfMap.get(word) * tfMap.get(word)));
            } else {
                keywordList.add(new Keyword(word, idfMedian * tfMap.get(word)));
            }

        }

        Collections.sort(keywordList);

        // keep only the top N keywords
        if (keywordList.size() > top) {
            keywordList = new ArrayList<>(keywordList.subList(0, top));
        }
        return keywordList;
    }

    /**
     * tf calculation: tf = N(i,j) / sum(N(k,j) for all k), where N(i,j) is the
     * number of occurrences of word i in document j (content) and the denominator
     * is the total number of occurrences of all words in the document.
     *
     * @param content the text to analyze
     * @return map of word -> tf value
     * @author lizongke
     * @date 2020/1/13 10:09
     */
    private Map<String, Double> getTf(String content) {
        Map<String, Double> tfMap = new HashMap<>();
        if (content == null || "".equals(content)) {
            return tfMap;
        }
        JiebaSegmenter segmenter = new JiebaSegmenter();
        List<String> segments = segmenter.sentenceProcess(content);
        Map<String, Integer> freqMap = new HashMap<>();

        int wordSum = 0;
        for (String segment : segments) {
            // skip stop words and single-character words
            if (!stopWordsSet.contains(segment) && segment.length() > 1) {
                wordSum++;
                if (freqMap.containsKey(segment)) {
                    freqMap.put(segment, freqMap.get(segment) + 1);
                } else {
                    freqMap.put(segment, 1);
                }
            }
        }

        // compute the tf value for each word (1.0 forces floating-point division,
        // matching the formula tf = N(i,j) / sum(N(k,j)))
        for (String word : freqMap.keySet()) {
            tfMap.put(word, freqMap.get(word) * 1.0 / wordSum);
        }
        }

        return tfMap;
    }

    /**
     * Loads the default jieba stop-word list:
     * https://github.com/yanyiwu/nodejieba/blob/master/dict/stop_words.utf8
     *
     * @param set the stop-word set to fill
     * @param in the stop-word input stream
     */
    private void loadStopWords(Set<String> set, InputStream in) {
        // one stop word per line; read as UTF-8 so Chinese entries are not garbled
        try (BufferedReader bufr = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = bufr.readLine()) != null) {
                set.add(line.trim());
            }
        } catch (Exception e) {
            log.error("failed to load stop words", e);
        }
    }

    /**
     * idf values would normally be computed from a corpus, but jieba already ships
     * a good idf dictionary, so it is used directly by default:
     * https://raw.githubusercontent.com/yanyiwu/nodejieba/master/dict/idf.utf8
     *
     * @param map the idf map to fill
     * @param in the idf dictionary input stream
     */
    private void loadIdfMap(Map<String, Double> map, InputStream in) {
        // each line of the idf dictionary is "word idf_value", separated by a space
        try (BufferedReader bufr = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = bufr.readLine()) != null) {
                String[] kv = line.trim().split(" ");
                if (kv.length < 2) {
                    continue;
                }
                map.put(kv[0], Double.parseDouble(kv[1]));
            }

            // compute the median idf value as the fallback for unknown words
            List<Double> idfList = new ArrayList<>(map.values());
            Collections.sort(idfList);
            idfMedian = idfList.get(idfList.size() / 2);
        } catch (Exception e) {
            log.error("failed to load idf dictionary", e);
        }
    }

}
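
To make the scoring concrete with a hand-worked example: if a word occurs 3 times in a document whose counted (non-stop-word, multi-character) words total 100, then tf = 3 / 100 = 0.03; if the idf dictionary gives that word an idf of 5.0, its final score is 0.03 * 5.0 = 0.15 (rounded to 4 decimal places by the Keyword constructor), and the words with the highest such scores are returned as keywords.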

Usage

public List<String> keywords(String content) throws Exception {
        // strip whitespace and punctuation, covering both ASCII and full-width Chinese forms
        String regEx = "[\n\r\t`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?^p]";
        // the second replaceAll removes full-width spaces (U+3000)
        content = content.replaceAll(regEx, "").replaceAll(" ", "").replaceAll("　", "");
        List<String> keywords = new ArrayList<>();
        // extract the top 10 keywords
        TfIdfAnalyzer tfIdfAnalyzer = new TfIdfAnalyzer();
        int top = 10;
        List<Keyword> list = tfIdfAnalyzer.analyze(content, top);
        for (Keyword word : list) {
            keywords.add(word.getName());
        }
        return keywords;
    }

Test
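
A minimal way to exercise the method (the enclosing service class name KeywordService and the sample text are assumptions for illustration):

import java.util.List;

public class KeywordTest {
    public static void main(String[] args) throws Exception {
        // KeywordService is a hypothetical class holding the keywords() method above
        KeywordService service = new KeywordService();
        String article = "结巴分词是一个优秀的中文分词工具,支持关键词提取、词性标注等功能。"
                + "本文演示如何在Java项目中使用结巴分词,通过TF-IDF算法提取文章的关键词。";
        List<String> keywords = service.keywords(article);
        // prints up to 10 keywords ranked by tf-idf score
        System.out.println(keywords);
    }
}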

 
