java结巴分词器实现文章段落关键词生成提取

西米先生「软件工程师」

已于 2023-10-25 17:47:25 修改

阅读量302

点赞数

文章标签： java 开发语言

于 2023-10-25 17:45:57 首次发布

本文链接：https://blog.csdn.net/sulei627719296/article/details/134040136

版权

一、pom.xml文件

<!--中文分词-->
        <dependency>
            <groupId>com.huaban</groupId>
            <artifactId>jieba-analysis</artifactId>
            <version>1.0.2</version>
        </dependency>

二、Dic字典文件

※关注微信公众号：搜题网

※回复关键词：dic

※免费获取分词字典文件

三、DTO类文件

package com.sulei.dto;

/**
 * @Description:
 * @Author sulei
 * @Date 2021/12/16 16:02
 */
public class Keyword implements Comparable<Keyword>{
    /**
     * tfidfvalue
     */
    private double tfidfvalue;
    /**
     * name
     */
    private String name;

    public Keyword(String name, double tfidfvalue) {
        this.name = name;
        // tfidf值只保留3位小数
        this.tfidfvalue = (double) Math.round(tfidfvalue * 10000) / 10000;
    }

    public double getTfidfvalue() {
        return tfidfvalue;
    }

    public void setTfidfvalue(double tfidfvalue) {
        this.tfidfvalue = tfidfvalue;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * 为了在返回tdidf分析结果时，可以按照值的从大到小顺序返回，故实现Comparable接口
     */
    @Override
    public int compareTo(Keyword o) {
        if (this.tfidfvalue - o.tfidfvalue > 0) {
            return -1;
        } else if (this.tfidfvalue - o.tfidfvalue < 0) {
            return 1;
        } else {
            return 0;
        }
        //return this.tfidfvalue-o.tfidfvalue>0?-1:1;
    }

    /**
     * 重写hashcode方法，计算方式与原生String的方法相同
     */
    @Override
    public int hashCode() {
        final int PRIME = 31;
        int result = 1;
        result = PRIME * result + ((name == null) ? 0 : name.hashCode());
        long temp;
        temp = Double.doubleToLongBits(tfidfvalue);
        result = PRIME * result + (int) (temp ^ (temp >>> 32));
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        Keyword other = (Keyword) obj;
        if (name == null) {
            if (other.name != null) {
                return false;
            }
        } else if (!name.equals(other.name)) {
            return false;
        }
        return true;
    }


}

package com.sulei.dto;

import com.huaban.analysis.jieba.JiebaSegmenter;
import lombok.extern.slf4j.Slf4j;
import org.springframework.core.io.ClassPathResource;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.*;

/**
 * @Description:
 * @Author sulei
 * @Date 2021/12/16 16:03
 */
@Slf4j
public class TfIdfAnalyzer {
    /**
     * idfMap集合
     */
    static HashMap<String, Double> idfMap;
    /**
     * 停止词集合
     */
    static HashSet<String> stopWordsSet;
    /**
     * idfMedian
     */
    static double idfMedian;

    /**
     *
     * 功能描述: tfidf分析方法
     *
     * @param content 需要分析的文本/文档内容
     * @param top 需要返回的tfidf值最高的N个关键词，若超过content本身含有的词语上限数目，则默认返回全部
     * @return 关键词
     * @auther: lizongke
     * @date: 2020/1/13 9:58
     */

    public List<Keyword> analyze(String content, int top) {
        List<Keyword> keywordList = new ArrayList<>();
        try {
            if (stopWordsSet == null) {
                stopWordsSet = new HashSet<>();
                ClassPathResource classPathResource = new ClassPathResource("dic/stop_words.txt");
                InputStream inputStream = classPathResource.getInputStream();
                loadStopWords(stopWordsSet, inputStream);
                //loadStopWords(stopWordsSet, this.getClass().getResourceAsStream("stop_words.txt"));
            }
            if (idfMap == null) {
                idfMap = new HashMap<>();
                loadIdfMap(idfMap, new ClassPathResource("dic/idf_dict.txt").getInputStream());
                //loadIDFMap(idfMap, this.getClass().getResourceAsStream("idf_dict.txt"));
            }
        } catch (Exception e) {
            log.info(e.toString());
        }
        Map<String, Double> tfMap = getTf(content);
        for (String word : tfMap.keySet()) {
            // 若该词不在idf文档中，则使用平均的idf值(可能定期需要对新出现的网络词语进行纳入)
            if (idfMap.containsKey(word)) {
                keywordList.add(new Keyword(word, idfMap.get(word) * tfMap.get(word)));
            } else {
                keywordList.add(new Keyword(word, idfMedian * tfMap.get(word)));
            }

        }

        Collections.sort(keywordList);

        if (keywordList.size() > top) {
            int num = keywordList.size() - top;
            for (int i = 0; i < num; i++) {
                keywordList.remove(top);
            }
        }
        return keywordList;
    }

    /**
     *
     * 功能描述: tf值计算公式 tf=N(i,j)/(sum(N(k,j) for all k))
     * N(i,j)表示词语Ni在该文档d（content）中出现的频率，sum(N(k,j))代表所有词语在文档d中出现的频率之和
     *
     * @param content 待分析文本
     * @return tf集合
     * @auther: lizongke
     * @date: 2020/1/13 10:09
     */
    private Map<String, Double> getTf(String content) {
        Map<String, Double> tfMap = new HashMap<>();
        if (content == null || "".equals(content)) {
            return tfMap;
        }
        JiebaSegmenter segmenter = new JiebaSegmenter();
        List<String> segments = segmenter.sentenceProcess(content);
        Map<String, Integer> freqMap = new HashMap<>();

        int wordSum = 0;
        for (String segment : segments) {
            //停用词不予考虑，单字词不予考虑
            if (!stopWordsSet.contains(segment) && segment.length() > 1) {
                wordSum++;
                if (freqMap.containsKey(segment)) {
                    freqMap.put(segment, freqMap.get(segment) + 1);
                } else {
                    freqMap.put(segment, 1);
                }
            }
        }

        // 计算double型的tf值
        for (String word : freqMap.keySet()) {
            tfMap.put(word, freqMap.get(word) * 0.1 / wordSum);
        }

        return tfMap;
    }

    /**
     * 默认jieba分词的停词表
     * url:https://github.com/yanyiwu/nodejieba/blob/master/dict/stop_words.utf8
     *
     * @param set 停止词集合
     * @param in 停止词输入流
     */
    private void loadStopWords(Set<String> set, InputStream in) {
        BufferedReader bufr;
        try {
            bufr = new BufferedReader(new InputStreamReader(in));
            String line = null;
            while ((line = bufr.readLine()) != null) {
                set.add(line.trim());
            }
            try {
                bufr.close();
            } catch (IOException e) {
                log.info(e.toString());
            }
        } catch (Exception e) {
            log.info(e.toString());
        }
    }

    /**
     * idf值本来需要语料库来自己按照公式进行计算，不过jieba分词已经提供了一份很好的idf字典，所以默认直接使用jieba分词的idf字典
     * url:https://raw.githubusercontent.com/yanyiwu/nodejieba/master/dict/idf.utf8
     *
     * @param map idf集合
     * @param in idf输入流
     */
    private void loadIdfMap(Map<String, Double> map, InputStream in) {
        BufferedReader bufr;
        try {
            bufr = new BufferedReader(new InputStreamReader(in));
            String line = null;
            while ((line = bufr.readLine()) != null) {
                String[] kv = line.trim().split(" ");
                map.put(kv[0], Double.parseDouble(kv[1]));
            }
            try {
                bufr.close();
            } catch (IOException e) {
                log.info(e.toString());
            }

            // 计算idf值的中位数
            List<Double> idfList = new ArrayList<>(map.values());
            Collections.sort(idfList);
            idfMedian = idfList.get(idfList.size() / 2);
        } catch (Exception e) {
            log.info(e.toString());
        }
    }

}

四、接口类、实现类

package com.sulei.service;
import java.util.List;
public interface FenciService {

    List<String> keywords(String content,int top) throws Exception;
}

package com.sulei.service.Impl;

import com.sulei.dto.Keyword;
import com.sulei.dto.TfIdfAnalyzer;
import com.sulei.service.FenciService;
import org.springframework.stereotype.Service;

import java.util.ArrayList;
import java.util.List;

/**
 * @Description:
 * @Author sulei
 * @Date 2021/12/16 16:08
 */
@Service
public class FenciServiceImpl implements FenciService {

    @Override
    public List<String> keywords(String content,int top) throws Exception {

        //去除空格和特殊字符
        String regEx = "[\n\r\t`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~！@#￥%……&*（）——+|{}【】‘；：”“’。， 、？^p\\\"\\\"-_]";
        String aa = "";
        content = content.replaceAll(regEx, aa).replaceAll("　", "").replaceAll(" ", "");
        List<String> keywords = new ArrayList<>();
        //提取10个关键词
        TfIdfAnalyzer tfIdfAnalyzer = new TfIdfAnalyzer();
        List<Keyword> list = tfIdfAnalyzer.analyze(content, top);
        for (Keyword word : list) {
            keywords.add(word.getName());
        }
        return keywords;
    }

}

五、controller使用

 @Autowired
 private FenciService fenciService;



List<String> keywords = fenciService.keywords("哈哈无花果翠云草酢浆草是什么，。我是帅哥666无花果真好吃还有北沙参穿心莲翠云草，草豆蔻和蝉蜕酢浆草也不错的", 20);
StringBuilder stringBuilder=new StringBuilder();
for(String s:keywords){
    stringBuilder.append(s).append("、");
}