Extracting Keywords from an Article with TF-IDF in Java, Using Jieba Segmentation

Unfortunately, the newest jieba-analysis version published to Maven is 1.0.2, while keyword extraction is only supported starting with 1.0.3.

The jieba-analysis source repository is here:

https://github.com/huaban/jieba-analysis

There are a couple of ways to use jieba segmentation in our own project:

(1) Download the latest jieba-analysis source, build it, push the jar to your own private Maven repository, and depend on it from there (a rough sketch follows this list).

(2) Depend on the older jar from Maven and manually port the newer version's keyword-extraction functionality into your own project.

I went with approach (2).
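
For reference, approach (1) would look roughly like the sketch below. This is only an outline: the jar file name depends on the version in the checked-out pom, and the repository URL and repositoryId are placeholders for your own Nexus/Artifactory setup.

git clone https://github.com/huaban/jieba-analysis.git
cd jieba-analysis
mvn clean package -DskipTests

# deploy the built jar to a private Maven repository
# (URL and repositoryId below are placeholders, substitute your own)
mvn deploy:deploy-file \
    -Dfile=target/jieba-analysis-1.0.3.jar \
    -DgroupId=com.huaban \
    -DartifactId=jieba-analysis \
    -Dversion=1.0.3 \
    -Dpackaging=jar \
    -Durl=http://your-nexus-host/repository/maven-releases/ \
    -DrepositoryId=your-repo-id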

1. Add the dependency

<!-- https://mvnrepository.com/artifact/com.huaban/jieba-analysis -->
		<dependency>
			<groupId>com.huaban</groupId>
			<artifactId>jieba-analysis</artifactId>
			<version>1.0.2</version>
		</dependency>

Add the dictionary files (they can be taken from the resources directory of the Git repository above).
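
Based on the paths the analyzer class below loads from, the two files are expected under src/main/resources/dic/ (the dic/ subdirectory is this project's own layout choice, not something jieba-analysis requires):

src/main/resources/
└── dic/
    ├── stop_words.txt   (stop-word list, one word per line)
    └── idf_dict.txt     (idf dictionary, one "word idf_value" pair per line)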

Add the entity class

public class Keyword implements Comparable<Keyword> {
    /**
     * the tf-idf score of this keyword
     */
    private double tfidfvalue;
    /**
     * the keyword text
     */
    private String name;

    public Keyword(String name, double tfidfvalue) {
        this.name = name;
        // round the tfidf value to 4 decimal places
        this.tfidfvalue = (double) Math.round(tfidfvalue * 10000) / 10000;
    }

    public double getTfidfvalue() {
        return tfidfvalue;
    }

    public void setTfidfvalue(double tfidfvalue) {
        this.tfidfvalue = tfidfvalue;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * Implement Comparable so that tfidf analysis results can be returned
     * in descending order of score.
     */
    @Override
    public int compareTo(Keyword o) {
        // descending order: the higher tfidf score sorts first
        return Double.compare(o.tfidfvalue, this.tfidfvalue);
    }

    /**
     * hashCode is based on the name field only, keeping it consistent with
     * equals(), which also compares only the name.
     */
    @Override
    public int hashCode() {
        final int PRIME = 31;
        int result = 1;
        result = PRIME * result + ((name == null) ? 0 : name.hashCode());
        return result;
    }

    /**
     * Two keywords are considered equal when their names are equal;
     * the tfidf score is deliberately ignored.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        Keyword other = (Keyword) obj;
        if (name == null) {
            if (other.name != null) {
                return false;
            }
        } else if (!name.equals(other.name)) {
            return false;
        }
        return true;
    }

}
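
A quick throwaway check of the ordering and equality semantics defined above (not part of the project code; the sample words and scores are made up):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class KeywordDemo {
    public static void main(String[] args) {
        List<Keyword> list = new ArrayList<>();
        list.add(new Keyword("database", 0.12));
        list.add(new Keyword("index", 0.47));
        list.add(new Keyword("query", 0.30));

        // compareTo sorts by tfidf value in descending order
        Collections.sort(list);
        for (Keyword k : list) {
            System.out.println(k.getName() + " -> " + k.getTfidfvalue());
        }
        // prints: index -> 0.47, query -> 0.3, database -> 0.12

        // equals() compares the name only, so the score is ignored
        System.out.println(new Keyword("index", 0.1).equals(new Keyword("index", 0.9))); // true
    }
}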

Add the analyzer class

@Slf4j
public class TfIdfAnalyzer {
    /**
     * idf dictionary: word -> idf value
     */
    static HashMap<String, Double> idfMap;
    /**
     * stop-word set
     */
    static HashSet<String> stopWordsSet;
    /**
     * median idf value, used as a fallback for words missing from the idf dictionary
     */
    static double idfMedian;

    /**
     * tfidf analysis.
     *
     * @param content the text/document to analyze
     * @param top the number of highest-tfidf keywords to return; if top exceeds
     *            the number of distinct words in content, all of them are returned
     * @return the keyword list
     * @author lizongke
     * @date 2020/1/13 9:58
     */
    public List<Keyword> analyze(String content, int top) {
        List<Keyword> keywordList = new ArrayList<>();
        try {
            if (stopWordsSet == null) {
                stopWordsSet = new HashSet<>();
                ClassPathResource classPathResource = new ClassPathResource("dic/stop_words.txt");
                InputStream inputStream = classPathResource.getInputStream();
                loadStopWords(stopWordsSet, inputStream);
            }
            if (idfMap == null) {
                idfMap = new HashMap<>();
                loadIdfMap(idfMap, new ClassPathResource("dic/idf_dict.txt").getInputStream());
            }
        } catch (Exception e) {
            log.error("failed to load the stop-word and idf dictionaries", e);
        }
        Map<String, Double> tfMap = getTf(content);
        for (String word : tfMap.keySet()) {
            // if the word is not in the idf dictionary, use the median idf value instead
            // (new internet slang may need to be added to the dictionary periodically)
            if (idfMap.containsKey(word)) {
                keywordList.add(new Keyword(word, idfMap.get(word) * tfMap.get(word)));
            } else {
                keywordList.add(new Keyword(word, idfMedian * tfMap.get(word)));
            }

        }

        Collections.sort(keywordList);

        // keep only the top N keywords
        if (keywordList.size() > top) {
            keywordList = new ArrayList<>(keywordList.subList(0, top));
        }
        return keywordList;
    }

    /**
     * tf calculation: tf = N(i,j) / sum(N(k,j) for all k), where N(i,j) is the
     * number of occurrences of word i in document j (content) and the denominator
     * is the total number of occurrences of all words in the document.
     *
     * @param content the text to analyze
     * @return map of word -> tf value
     * @author lizongke
     * @date 2020/1/13 10:09
     */
    private Map<String, Double> getTf(String content) {
        Map<String, Double> tfMap = new HashMap<>();
        if (content == null || "".equals(content)) {
            return tfMap;
        }
        JiebaSegmenter segmenter = new JiebaSegmenter();
        List<String> segments = segmenter.sentenceProcess(content);
        Map<String, Integer> freqMap = new HashMap<>();

        int wordSum = 0;
        for (String segment : segments) {
            // skip stop words and single-character words
            if (!stopWordsSet.contains(segment) && segment.length() > 1) {
                wordSum++;
                if (freqMap.containsKey(segment)) {
                    freqMap.put(segment, freqMap.get(segment) + 1);
                } else {
                    freqMap.put(segment, 1);
                }
            }
        }

        // compute the tf value for each word (1.0 forces floating-point division,
        // matching the formula tf = N(i,j) / sum(N(k,j)))
        for (String word : freqMap.keySet()) {
            tfMap.put(word, freqMap.get(word) * 1.0 / wordSum);
        }
        }

        return tfMap;
    }

    /**
     * Loads the default jieba stop-word list:
     * https://github.com/yanyiwu/nodejieba/blob/master/dict/stop_words.utf8
     *
     * @param set the stop-word set to fill
     * @param in the stop-word input stream
     */
    private void loadStopWords(Set<String> set, InputStream in) {
        // one stop word per line; read as UTF-8 so Chinese entries are not garbled
        try (BufferedReader bufr = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = bufr.readLine()) != null) {
                set.add(line.trim());
            }
        } catch (Exception e) {
            log.error("failed to load stop words", e);
        }
    }

    /**
     * idf values would normally be computed from a corpus, but jieba already ships
     * a good idf dictionary, so it is used directly by default:
     * https://raw.githubusercontent.com/yanyiwu/nodejieba/master/dict/idf.utf8
     *
     * @param map the idf map to fill
     * @param in the idf dictionary input stream
     */
    private void loadIdfMap(Map<String, Double> map, InputStream in) {
        // each line of the idf dictionary is "word idf_value", separated by a space
        try (BufferedReader bufr = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = bufr.readLine()) != null) {
                String[] kv = line.trim().split(" ");
                if (kv.length < 2) {
                    continue;
                }
                map.put(kv[0], Double.parseDouble(kv[1]));
            }

            // compute the median idf value as the fallback for unknown words
            List<Double> idfList = new ArrayList<>(map.values());
            Collections.sort(idfList);
            idfMedian = idfList.get(idfList.size() / 2);
        } catch (Exception e) {
            log.error("failed to load idf dictionary", e);
        }
    }

}
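
To make the scoring concrete with a hand-worked example: if a word occurs 3 times in a document whose counted (non-stop-word, multi-character) words total 100, then tf = 3 / 100 = 0.03; if the idf dictionary gives that word an idf of 5.0, its final score is 0.03 * 5.0 = 0.15 (rounded to 4 decimal places by the Keyword constructor), and the words with the highest such scores are returned as keywords.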

Usage

public List<String> keywords(String content) throws Exception {
        // strip whitespace and punctuation, covering both ASCII and full-width Chinese forms
        String regEx = "[\n\r\t`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?^p]";
        // the second replaceAll removes full-width spaces (U+3000)
        content = content.replaceAll(regEx, "").replaceAll(" ", "").replaceAll("　", "");
        List<String> keywords = new ArrayList<>();
        // extract the top 10 keywords
        TfIdfAnalyzer tfIdfAnalyzer = new TfIdfAnalyzer();
        int top = 10;
        List<Keyword> list = tfIdfAnalyzer.analyze(content, top);
        for (Keyword word : list) {
            keywords.add(word.getName());
        }
        return keywords;
    }

Test
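
A minimal way to exercise the method (the enclosing service class name KeywordService and the sample text are assumptions for illustration):

import java.util.List;

public class KeywordTest {
    public static void main(String[] args) throws Exception {
        // KeywordService is a hypothetical class holding the keywords() method above
        KeywordService service = new KeywordService();
        String article = "结巴分词是一个优秀的中文分词工具,支持关键词提取、词性标注等功能。"
                + "本文演示如何在Java项目中使用结巴分词,通过TF-IDF算法提取文章的关键词。";
        List<String> keywords = service.keywords(article);
        // prints up to 10 keywords ranked by tf-idf score
        System.out.println(keywords);
    }
}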

 
