Python 通过库自带函数和自定义函数实现中文文本摘要程序（两种方法）

最新推荐文章于 2022-12-21 11:11:20 发布

喜欢地上爬的孩子

最新推荐文章于 2022-12-21 11:11:20 发布

阅读量1.3k

点赞数

分类专栏： nlp python pyhanlp 文章标签： python 开发语言 nlp

本文链接：https://blog.csdn.net/thefreelittle/article/details/121342813

版权

python 同时被 3 个专栏收录

19 篇文章 4 订阅

订阅专栏

nlp

4 篇文章 0 订阅

订阅专栏

pyhanlp

1 篇文章 0 订阅

订阅专栏

方法一：（自带函数操作）

# Method 1: call the summary function that ships with HanLP.
# '正文' ("body text") is a placeholder for the article to summarise; the
# second argument (10) is presumably the maximum number of summary sentences
# -- TODO confirm against the HanLP documentation.
# NOTE(review): HanLP is not imported in this snippet; presumably it comes
# from `pyhanlp` -- verify before running.
HanLP.extractSummary('正文', 10)

方法二：（自定义函数操作）

# coding:utf-8

import nltk
import numpy
import jieba
import codecs

N = 100  # number of most frequent words kept as keywords
CLUSTER_THRESHOLD = 5  # keywords whose token positions differ by less than this share a cluster
TOP_SENTENCES = 5  # number of top-scoring sentences returned in the top-n summary


# 摘要提取几句话-分句
def sent_tokenizer(texts):
    """Split *texts* into sentences at sentence-ending punctuation.

    A sentence ends at a punctuation character that is not immediately
    followed by another punctuation character, so a run such as "?!" stays
    attached to the sentence it closes. Whatever remains after the last
    split (including a trailing punctuation run) becomes the final sentence.

    Args:
        texts: The string to split.

    Returns:
        A list of substrings which, concatenated, reproduce *texts*.
        An empty string yields an empty list.
    """
    punt_list = ".!?。！？，"
    sentences = []
    start = 0  # index where the current sentence begins
    last = len(texts) - 1
    for i, char in enumerate(texts):
        # Inspect the following character directly instead of carrying a
        # look-ahead variable between iterations; this also makes text that
        # starts with punctuation safe to process.
        if char in punt_list and i < last and texts[i + 1] not in punt_list:
            sentences.append(texts[start:i + 1])
            start = i + 1
    if start < len(texts):
        # Text without a closing split point (e.g. no trailing punctuation).
        sentences.append(texts[start:])
    return sentences


# 摘要提取几句话-停用词
def load_stopwordslist(path):
    """Load a UTF-8 stop-word file into a dict usable for membership tests.

    Args:
        path: Path of a text file containing one stop word per line.

    Returns:
        A dict whose keys are the whitespace-stripped lines of the file and
        whose values are all None.
    """
    print('load stopwords...')
    # The context manager guarantees the file handle is closed, even if
    # reading or decoding fails part-way through.
    with codecs.open(path, 'r', encoding='utf8') as stop_file:
        stoplist = [line.strip() for line in stop_file.readlines()]
    return dict.fromkeys(stoplist)


# 摘要提取几句话-摘要
def summarize(text):
    """Build two extractive summaries of *text*.

    The text is split into sentences, tokenised with jieba, and the N most
    frequent words (excluding stop words, single-character tokens and tabs)
    are used as keywords to score every sentence.

    Args:
        text: The document to summarise.

    Returns:
        A dict with two lists of sentences, both in document order:
        ``mean_scored_summary`` holds sentences scoring above
        mean + 0.5 * standard deviation; ``top_n_summary`` holds the
        TOP_SENTENCES highest-scoring sentences. Both lists are empty when
        no sentence contains a keyword.
    """
    import os  # local import: only needed to build the stop-word path

    # os.path.join keeps the relative path valid on every platform; a
    # hard-coded backslash only resolves on Windows.
    stopwords = load_stopwordslist(os.path.join('stopwords', 'stop_words.txt'))
    sentences = sent_tokenizer(text)
    words = [
        w
        for sentence in sentences
        for w in jieba.cut(sentence)
        if w not in stopwords and len(w) > 1 and w != '\t'
    ]
    wordfre = nltk.FreqDist(words)
    by_frequency = sorted(wordfre.items(), key=lambda d: d[1], reverse=True)
    topn_words = [word for word, _ in by_frequency][:N]
    scored_sentences = _score_sentences(sentences, topn_words)
    if not scored_sentences:
        # Nothing to rank; avoids numpy computing the mean of an empty list.
        return {'top_n_summary': [], 'mean_scored_summary': []}

    # Approach 1: keep sentences whose score clears mean + 0.5 * std.
    sentence_scores = [score for _, score in scored_sentences]
    avg = numpy.mean(sentence_scores)
    std = numpy.std(sentence_scores)
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > (avg + 0.5 * std)]

    # Approach 2: keep the top-n sentences, then restore document order.
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # A dict literal is used so the result does not depend on the builtin
    # name `dict` still being bound to the type at call time.
    return {
        'top_n_summary': [sentences[idx] for (idx, score) in top_n_scored],
        'mean_scored_summary': [sentences[idx] for (idx, score) in mean_scored],
    }


# 摘要提取几句话-句子得分
def _score_sentences(sentences, topn_words):
    """Score each sentence by its densest cluster of keywords.

    Every sentence is tokenised with jieba and the position of the first
    occurrence of each keyword is recorded. Neighbouring positions that
    differ by less than CLUSTER_THRESHOLD are grouped into one cluster. A
    cluster scores (keywords in cluster) ** 2 / (token span of the cluster),
    and the sentence takes its best cluster score.

    Args:
        sentences: List of sentence strings.
        topn_words: Keywords to look for in each sentence.

    Returns:
        A list of (sentence index, score) tuples in sentence order.
        Sentences containing no keyword are left out.
    """
    results = []
    for position, sentence in enumerate(sentences):
        tokens = list(jieba.cut(sentence))
        # First-occurrence index of every keyword present in this sentence.
        hits = sorted(tokens.index(word) for word in topn_words if word in tokens)
        if not hits:
            continue

        # Group consecutive keyword positions that lie close together.
        groups = [[hits[0]]]
        for previous, current in zip(hits, hits[1:]):
            if current - previous < CLUSTER_THRESHOLD:
                groups[-1].append(current)
            else:
                groups.append([current])

        # The sentence score is the highest score among its clusters.
        best = 0
        for group in groups:
            keyword_count = len(group)
            span = group[-1] - group[0] + 1
            best = max(best, 1.0 * keyword_count * keyword_count / span)
        results.append((position, best))
    return results


if __name__ == '__main__':
    # Demo run on a sample paragraph. The result is stored under a name that
    # does not shadow the builtin `dict`.
    summary = summarize(u'目前我国正处于一个大数据时代，互联网技术的发展使得物联网技术快速发展，近年来更是直接在家层领域中产生较为直接的影响。人工智能技术的出现为等能家慎的发展提供了新的方向与内容。基于互联网技术的支持我国等能家居的发展较为迅速，对人们的居家生活产生较大影响。因此本文将针对大数据背景下智能家居的发展现状进行阐述，并对物联网技术在智能家居领域中的存在意义进行探索研究。 ，'
                        )
    # Headers follow the numbering used inside summarize(): approach 1 is
    # the mean/standard-deviation filter, approach 2 is the top-n selection.
    print('-----------approach 1-------------')
    for sent in summary['mean_scored_summary']:
        print(sent)
    print('-----------approach 2-------------')
    for sent in summary['top_n_summary']:
        print(sent)

两个结果如下所示：

自带函数的输出结果：

在这里插入图片描述

手动编写函数的输出结果：

在这里插入图片描述

喜欢地上爬的孩子

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
打赏
0
评论
python通过库自带函数和自己编写的函数-实现的一个中文文本摘要程序（两种方法）

方法一：（自带函数操作）# 通过使用hanlp中的函数HanLP.extractSummary('正文', 10)方法二：（自定义函数操作）# coding:utf-8import nltkimport numpyimport jiebaimport codecsN = 100 # 单词数量CLUSTER_THRESHOLD = 5 # 单词间的距离TOP_SENTENCES = 5 # 返回的top n句子# 摘要提取几句话-分句def sent_tokenize
复制链接

扫一扫