# PageRank (TextRank)

from numpy import *
import jieba
import copy

# Load the raw text whose words will be ranked.
with open('content', 'r', encoding='utf-8') as f:
    content = f.read()

# Load the stop-word list, one entry per line, without the trailing newline.
with open('stopwords', 'r', encoding='utf-8') as f:
    sw = [line.replace('\n', '') for line in f]

set_sw = set(sw)

# Segment the text with jieba and drop stop words in a single pass.
# (Calling list.remove() once per stop-word hit rescans the list every time.)
word_list = [word for word in jieba.cut(content) if word not in set_sw]
word_list_vice = list(word_list)  # same-content copy, kept under its old name

# Vocabulary in first-occurrence order, so row/column numbering is the same
# on every run, plus a word -> index lookup so the matrix fill below does not
# need a linear list.index() search for every token.
word_set_list = list(dict.fromkeys(word_list))
word_index = {word: position for position, word in enumerate(word_set_list)}

# Symmetric co-occurrence counts between adjacent tokens.
aa = zeros((len(word_set_list), len(word_set_list)), dtype=float)

for word, next_word in zip(word_list, word_list[1:]):
    sindex = word_index[word]
    eindex = word_index[next_word]
    # Two separate increments on purpose: a word immediately followed by
    # itself adds 2 to its diagonal cell.
    aa[sindex][eindex] += 1
    aa[eindex][sindex] += 1

def graph_init(a):
    """Build the transition matrix by normalising each row of ``a``.

    Every entry is divided by the sum of its row.  A row whose sum is zero
    is left as all zeros: dividing it would fill the row with NaN.

    Args:
        a: 2-D numpy array of edge weights.

    Returns:
        A new float array with the same shape as ``a``.
    """
    row, column = a.shape
    c = zeros((row, column), dtype=float)
    for i in range(row):
        # Sum each row once instead of once per cell.
        total = a[i].sum()
        if total != 0:
            c[i] = a[i] / total
    return c

def pr_init(c):
    """Create the starting PageRank vector.

    Args:
        c: 2-D numpy array; only its number of rows is used.

    Returns:
        A 1-D float array with one entry per row of ``c``, each set to
        ``1 / rows``.  An empty array is returned when ``c`` has no rows.
    """
    node_count, _ = c.shape
    pr = zeros(node_count, dtype=float)
    if node_count:
        pr.fill(float(1) / node_count)
    return pr

def pageRank(s, pr, p, tol=1e-12, max_iter=10000):
    """Iterate the PageRank update until the vector stops changing.

    Each step computes ``p * dot(pr, s) + (1 - p) * pr``.  Iteration stops
    when no component moves by more than ``tol`` in one step, or after
    ``max_iter`` steps, whichever comes first.  An exact floating-point
    equality test can spin forever on last-bit oscillation or NaN input,
    hence the tolerance and the iteration cap.

    Args:
        s: square transition matrix (2-D numpy array).
        pr: starting rank vector (1-D numpy array).
        p: weight given to the ``dot(pr, s)`` term; ``1 - p`` weights the
            current vector.
        tol: largest per-component absolute change that counts as converged.
        max_iter: upper bound on the number of update steps.

    Returns:
        The rank vector at convergence, or the latest estimate if
        ``max_iter`` steps were taken without converging.
    """
    for _ in range(max_iter):
        # Compute the update once and reuse it for both the test and the step.
        updated = p * dot(pr, s) + (1 - p) * pr
        if allclose(updated, pr, rtol=0.0, atol=tol):
            return updated
        pr = updated
    return pr

if __name__ == "__main__":
    # Normalise the co-occurrence matrix and start from a uniform vector.
    s = graph_init(aa)
    pr = pr_init(s)
    p = 0.8  # weight of the dot(pr, s) term in pageRank's update
    r = pageRank(s, pr, p)

    # Pair every vocabulary word with its score, in vocabulary order.
    result_list = [
        {'word': token, 'score': score}
        for token, score in zip(word_set_list, r)
    ]

    # Ascending by score; result_list itself is left unsorted.
    rl = list(result_list)
    rl.sort(key=lambda entry: entry['score'])

# 转载于 (reposted from): https://www.cnblogs.com/kayy/p/7976705.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值