import requests
import re
import json
# import matplotlib.pyplot as plt
# import networkx as nx
import numpy as np
class PageRank:
    """Crawl outbound links starting from hao123.com, build the
    column-stochastic transition matrix S, then iterate PR = A . PR
    (A = d*S + (1-d)/N * E) until convergence.

    Intermediate results are persisted as JSON files in the current
    working directory: 'urls.json' (adjacency map), 'index_urls.json'
    (row/column index -> URL), 'dp.json' (matrix S), 'pr.json' (PR vector).
    """

    def __init__(self):
        # Seed page the crawl starts from.
        self.url = 'https://www.hao123.com/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3602.2 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        # Captures href="http(s)://www....com" style outbound links.
        self.pattern = re.compile(r'(?:href=")(https{0,1}://www\S*?\.com)')

    def get_inside_urls(self, url):
        """Return the de-duplicated outbound links found on one page.

        :param url: any web page URL
        :return: list of distinct '.com' links referenced by the page
        :raises requests.RequestException: on network failure/timeout
        """
        r = requests.get(url, headers=self.headers, timeout=5)
        return list(set(self.pattern.findall(r.text)))

    def on_start(self):
        """Breadth-first crawl from the seed URL (two expansion rounds —
        NOTE(review): the original docstring said "3 hops" but the loop
        only runs twice) and store {page: [outbound links]} in 'urls.json'.

        :return: None
        """
        dp = {}                       # adjacency map: url -> list of outbound urls
        layer_url, run_time = [[self.url]], 0
        i = 1                         # progress counter for the 'done N' log lines
        while run_time < 2:
            layer = layer_url.pop(0)
            layer_urls = set()        # next BFS frontier, de-duplicated
            while layer:
                url = layer.pop(0)
                try:
                    if url not in dp:
                        urls = self.get_inside_urls(url)
                        layer_urls.update(urls)
                        dp[url] = urls
                        print(f'done {i}')
                        i += 1
                # Best-effort crawl: skip pages that fail to download, but
                # do not swallow KeyboardInterrupt/SystemExit (the original
                # bare `except:` did).
                except Exception:
                    print('connect wrong')
            layer_url.append(list(layer_urls))
            run_time += 1
        with open('urls.json', 'w') as fp:
            json.dump(dp, fp)

    def pr_caculate(self):
        """Read the crawled adjacency map and build the column-stochastic
        transition matrix S ('dp' in the files). Column j holds the
        out-link probabilities of page j; dangling pages (no known
        out-links) get a uniform 1/N column.

        Writes 'index_urls.json' (index -> URL) and 'dp.json' (matrix).
        :return: None
        """
        with open('urls.json', 'r') as fp:
            urls = json.load(fp)
        # Node set = crawled pages plus every page they link to.
        url_layer = set(urls)
        for targets in urls.values():
            url_layer |= set(targets)
        num_urls = len(url_layer)
        print(f'there are {num_urls} urls')
        col_urls = list(url_layer)
        with open('index_urls.json', 'w') as fp:
            json.dump({'index_urls': col_urls}, fp)
        # O(1) index lookup; the original used list.index -> O(n) per link,
        # O(n^2) overall.
        index_of = {u: k for k, u in enumerate(col_urls)}
        dp = [[0] * num_urls for _ in range(num_urls)]
        for idx, url in enumerate(col_urls):
            to_urls = urls.get(url)
            if to_urls:
                share = 1 / len(to_urls)
                for ul in to_urls:
                    dp[index_of[ul]][idx] = share
            else:
                # Dangling node: jump uniformly to any page.
                for row in range(num_urls):
                    dp[row][idx] = 1 / num_urls
        with open('dp.json', 'w') as fp:
            json.dump({'data': dp}, fp)

    def pr_caculate_inner(self, d=0.85):
        """Iterate PR = A . PR with A = d*S + (1-d)/N * E until the mean
        squared update is <= 1e-3, then store the PR vector in 'pr.json'.

        :param d: damping factor (default 0.85)
        :return: None
        """
        with open('dp.json', 'r') as fp:
            data = json.load(fp)
        S = np.array(data['data'])
        N = S.shape[0]
        print(S.shape)
        A = d * S + (1 - d) / N * np.ones_like(S)
        epsilon = .001
        pr = np.ones((N, 1))
        while True:
            pr_next = A @ pr
            # np.mean gives a true scalar; the original compared a
            # 1-element ndarray against epsilon.
            mse = float(np.mean((pr - pr_next) ** 2))
            pr = pr_next
            if mse <= epsilon:
                break
        with open('pr.json', 'w') as fp:
            json.dump({'pr': pr.tolist()}, fp)

    def top_10pr_url(self):
        """Print the ten highest-PR entries ([score, index] pairs) and
        their URLs, highest first.

        :return: None
        """
        with open('pr.json', 'r') as fp:
            pr = json.load(fp)['pr']
        # Tag each score row with its matrix index: [score] -> [score, index].
        for i, row in enumerate(pr):
            row.append(i)
        # sorted() replaces the original 10-pass bubble sort (same result,
        # O(n log n), and correct even when len(pr) < 10).
        top10 = sorted(pr, key=lambda row: row[0], reverse=True)[:10]
        print(top10)
        with open('index_urls.json', 'r') as fp:
            index_urls = json.load(fp)
        urls = index_urls['index_urls']
        for entry in top10:
            print(urls[entry[1]])
if __name__ == '__main__':
    # Pipeline: crawl -> transition matrix S -> iterate PR -> report top 10.
    pagerank = PageRank()
    pagerank.on_start()            # crawl the link structure and persist it
    # pagerank.url_connection_plot()
    pagerank.pr_caculate()         # build and persist the transition matrix S
    pagerank.pr_caculate_inner()   # build A, iterate the PR values, persist them
    pagerank.top_10pr_url()        # print the ten highest-ranked sites and PR values