python新闻编写_百度AI新闻摘要python可以怎么写

最新推荐文章于 2023-05-02 22:09:07 发布

weixin_39700397

最新推荐文章于 2023-05-02 22:09:07 发布

阅读量206

点赞数

文章标签： python新闻编写

本文链接：https://blog.csdn.net/weixin_39700397/article/details/111431002

版权

展开全部

完整代码：

from collections import OrderedDict

import numpy as np

import spacy

from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text with a TextRank-style graph ranking.

    Words that pass the part-of-speech and stop-word filters become graph
    nodes, words co-occurring inside a sliding window become edges, and a
    PageRank-like iteration assigns a weight to every node.  Call
    ``analyze`` first, then ``get_keywords`` to print the result.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 10  # maximum number of iteration steps
        self.node_weight = None  # keyword -> weight mapping, set by analyze()

    def set_stopwords(self, stopwords):
        """Flag the default stop words plus ``stopwords`` in the spaCy vocab.

        This mutates the module-level ``nlp`` vocabulary, so the flags stay
        set for later calls as well.
        """
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Return, per sentence of ``doc``, the words kept as candidates.

        A token is kept when its ``pos_`` is in ``candidate_pos`` and it is
        not a stop word.  When ``lower`` is ``True`` the kept words are
        lower-cased.  The result is a list of lists of strings, one inner
        list per sentence (possibly empty).
        """
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store only words with a candidate POS tag.
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index in first-seen order."""
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    # The next free index always equals the current size.
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build the ordered, duplicate-free list of co-occurring word pairs.

        Each word is paired with the following ``window_size - 1`` words of
        the same sentence.  Pairs are returned in first-seen order.
        """
        token_pairs = []
        seen = set()  # O(1) duplicate check; the list alone keeps the order
        for sentence in sentences:
            for i, word in enumerate(sentence):
                stop = min(i + window_size, len(sentence))
                for j in range(i + 1, stop):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Return the symmetric adjacency matrix normalized by column.

        ``vocab`` maps words to row/column indices and ``token_pairs`` lists
        the edges.  Columns whose sum is zero (words without any edge) are
        left as all zeros instead of being divided by zero.
        """
        # Build the adjacency matrix.
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1

        # Make the matrix symmetric (the graph is undirected).
        g = self.symmetrize(g)

        # Normalize by column.  ``out`` is required together with ``where``:
        # without it the skipped entries would be uninitialized memory.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)
        return g_norm

    def get_keywords(self, number=10):
        """Print the ``number`` highest-weighted keywords, best first."""
        ranked = sorted(self.node_weight.items(),
                        key=lambda t: t[1], reverse=True)
        # Slice instead of counting inside the loop so exactly ``number``
        # lines are printed.
        for key, value in ranked[:max(number, 0)]:
            print(key + ' - ' + str(value))

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=()):
        """Analyze ``text`` and store the keyword weights in ``node_weight``.

        ``candidate_pos`` lists the POS tags kept as keyword candidates,
        ``window_size`` is the co-occurrence window, ``lower`` lower-cases
        the candidates and ``stopwords`` adds extra stop words.
        """
        # Set stop words.
        self.set_stopwords(stopwords)

        # Parse text with spaCy.
        doc = nlp(text)

        # Filter sentences: list of lists of words.
        sentences = self.sentence_segment(doc, candidate_pos, lower)

        # Build vocabulary.
        vocab = self.get_vocab(sentences)

        # Get token pairs from windows.
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix.
        g = self.get_matrix(vocab, token_pairs)

        # Initialization of the weights (PageRank values).
        pr = np.ones(len(vocab))

        # Iterate until the total weight stops changing or steps run out.
        # NOTE(review): the criterion compares the *sum* of the weights; a
        # per-element comparison would be stricter - confirm before changing,
        # since it alters the resulting scores.
        previous_pr = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = sum(pr)
            if abs(previous_pr - total) < self.min_diff:
                break
            previous_pr = total

        # Get the weight of each node.
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        self.node_weight = node_weight

weixin_39700397

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python新闻编写_百度AI新闻摘要python可以怎么写

展开全部 完整代码：from collections import OrderedDict; import numpy as np; import spacy; from spacy.lang.en.stop_words import STOP_WORDS; nlp = spacy.load('en_core_web_sm'); class TextRank4Keyword(): """Extract keywords f...
复制链接

扫一扫