方法一:(自带函数操作)
# Built-in approach: ask HanLP for a summary of the given text.
# NOTE(review): `HanLP` is not imported anywhere in the visible source --
# presumably it comes from the pyhanlp package; confirm before running.
# NOTE(review): the second argument (10) is presumably the maximum number of
# sentences to extract -- verify against the HanLP API.
HanLP.extractSummary('正文', 10)
方法二:(自定义函数操作)
import codecs
import os

import jieba
import nltk
import numpy
# Number of most frequent words kept as the "significant" vocabulary.
N = 100
# Two significant-word positions closer than this many tokens share a cluster.
CLUSTER_THRESHOLD = 5
# Number of highest-scoring sentences returned in the top-n summary.
TOP_SENTENCES = 5
def sent_tokenizer(texts):
    """Split ``texts`` into sentences at runs of sentence punctuation.

    A sentence ends after the last character of a run of punctuation marks,
    so consecutive marks such as ``?!`` stay attached to the sentence they
    close.  Whatever follows the final cut (including a trailing punctuation
    run) is returned as the last sentence, so the pieces always concatenate
    back to ``texts``.

    Args:
        texts: The string to split.

    Returns:
        A list of substrings of ``texts`` in their original order; empty for
        an empty string.
    """
    punt_list = ".!?。!?,"
    sentences = []
    start = 0
    last = len(texts) - 1
    for i, char in enumerate(texts):
        if char not in punt_list:
            continue
        # Cut only at the end of a punctuation run.  Looking ahead by index
        # (instead of carrying a look-ahead variable between iterations)
        # also works when the text *starts* with punctuation, which
        # previously raised UnboundLocalError.
        if i < last and texts[i + 1] not in punt_list:
            sentences.append(texts[start:i + 1])
            start = i + 1
    # Keep the remainder: text without closing punctuation, or a final
    # sentence whose punctuation run reaches the end of the string.
    if start < len(texts):
        sentences.append(texts[start:])
    return sentences
def load_stopwordslist(path):
    """Load a UTF-8 stop-word file into a dict keyed by stop word.

    Every line of the file is stripped of surrounding whitespace and used as
    a key; all values are ``None``.  The dict is used only for fast
    membership tests.

    Args:
        path: Location of the stop-word text file.

    Returns:
        A dict whose keys are the stripped lines of the file.

    Raises:
        IOError/OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    print('load stopwords...')
    # The context manager closes the handle even when decoding fails; the
    # previous version never closed the file.
    with codecs.open(path, 'r', encoding='utf8') as stop_file:
        stoplist = [line.strip() for line in stop_file.readlines()]
    return {word: None for word in stoplist}
def summarize(text, stopwords_path=None):
    """Build two extractive summaries of ``text``.

    The text is split into sentences, the ``N`` most frequent non-stop words
    (longer than one character) are selected, and every sentence is scored
    by ``_score_sentences`` against those words.

    Args:
        text: The document to summarize.
        stopwords_path: Optional path to the stop-word file.  Defaults to
            ``stopwords/stop_words.txt`` relative to the working directory.

    Returns:
        A dict with two lists of sentences, both in document order:
        ``top_n_summary`` holds the ``TOP_SENTENCES`` best-scoring sentences
        and ``mean_scored_summary`` holds every sentence scoring above
        ``mean + 0.5 * std``.  Both lists are empty when no sentence
        contains a frequent word.
    """
    if stopwords_path is None:
        # os.path.join yields the original 'stopwords\\stop_words.txt' on
        # Windows and a valid path on POSIX, where a backslash would be
        # part of the file name instead of a separator.
        stopwords_path = os.path.join('stopwords', 'stop_words.txt')
    stopwords = load_stopwordslist(stopwords_path)
    sentences = sent_tokenizer(text)
    words = [
        w
        for sentence in sentences
        for w in jieba.cut(sentence)
        if w not in stopwords and len(w) > 1
    ]
    wordfre = nltk.FreqDist(words)
    ranked = sorted(wordfre.items(), key=lambda d: d[1], reverse=True)
    topn_words = [word for word, _ in ranked][:N]
    scored_sentences = _score_sentences(sentences, topn_words)
    if not scored_sentences:
        # Nothing to rank; bail out before numpy computes the mean/std of
        # an empty list (NaN plus RuntimeWarnings).
        return {'top_n_summary': [], 'mean_scored_summary': []}
    values = [score for _, score in scored_sentences]
    cutoff = numpy.mean(values) + 0.5 * numpy.std(values)
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences if score > cutoff]
    # Pick the best TOP_SENTENCES by score, then restore document order.
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])
    return {
        'top_n_summary': [sentences[idx] for (idx, score) in top_n_scored],
        'mean_scored_summary': [sentences[idx] for (idx, score) in mean_scored],
    }
def _score_sentences(sentences, topn_words):
    """Score each sentence by its densest cluster of significant words.

    Each sentence is tokenized with jieba and the position of the first
    occurrence of every word in ``topn_words`` is collected.  Neighbouring
    positions less than ``CLUSTER_THRESHOLD`` apart form a cluster; a
    cluster scores ``count ** 2 / span`` and the sentence receives the
    score of its best cluster.

    Args:
        sentences: Sequence of sentence strings.
        topn_words: Words regarded as significant.

    Returns:
        A list of ``(sentence_index, score)`` tuples.  Sentences that
        contain none of ``topn_words`` are omitted.
    """
    results = []
    tokenized = [list(jieba.cut(sentence)) for sentence in sentences]
    for position, tokens in enumerate(tokenized):
        # First-occurrence index of every significant word in the sentence.
        hits = sorted(tokens.index(word) for word in topn_words if word in tokens)
        if not hits:
            continue

        # Walk adjacent hits, opening a new cluster whenever the gap is
        # too wide.
        groups = [[hits[0]]]
        for previous, current in zip(hits, hits[1:]):
            if current - previous < CLUSTER_THRESHOLD:
                groups[-1].append(current)
            else:
                groups.append([current])

        best = max(
            1.0 * len(group) * len(group) / (group[-1] - group[0] + 1)
            for group in groups
        )
        results.append((position, best))
    return results
if __name__ == '__main__':
    # Demo: summarize a sample paragraph and print both summaries.
    # The result is named ``summary`` so the builtin ``dict`` is not
    # shadowed at module level.
    summary = summarize(u'目前我国正处于一个大数据时代,互联网技术的发展使得物联网技术快速发展,近年来更是直接在家层领域中产生较为直接的影响。人工智能技术的出现为等能家慎的发展提供了新的方向与内容。基于互联网技术的支持我国等能家居的发展较为迅速,对人们的居家生活产生较大影响。因此本文将针对大数据背景下智能家居的发展现状进行阐述,并对物联网技术在智能家居领域中的存在意义进行探索研究。 ,'
    )
    print('-----------approach 1-------------')
    for sent in summary['top_n_summary']:
        print(sent)
    print('-----------approach 2-------------')
    for sent in summary['mean_scored_summary']:
        print(sent)
两个结果如下所示:
自带函数的输出结果:
手动编写函数的输出结果: