# Crawler stuff: scrape links from hao123 and rank them with PageRank.

import requests
import re
import json
# import matplotlib.pyplot as plt
# import networkx as nx
import numpy as np

class PageRank:
    """Crawl a link graph starting from hao123.com, then compute PageRank.

    Pipeline (each stage persists its result to a JSON file in the CWD):
      on_start()           -> 'urls.json'        {url: [outbound urls]}
      pr_caculate()        -> 'index_urls.json', 'dp.json' (transition matrix S)
      pr_caculate_inner()  -> 'pr.json'          (converged PR vector)
      top_10pr_url()       -> prints the 10 highest-ranked URLs
    """

    def __init__(self):
        # Seed page for the crawl.
        self.url = 'https://www.hao123.com/'
        self.headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3602.2 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest'
            }
        # Captures absolute http(s) links to www.*.com domains in href="..." attributes.
        self.pattern = re.compile(r'(?:href=")(https{0,1}://www\S*?\.com)')

    def get_inside_urls(self, url):
        '''
        Purpose: find the outbound links contained in one web page.
        :param url: any web page URL
        :return: deduplicated list of .com links found in its HTML
                 (order unspecified because of the intermediate set)
        '''
        response = requests.get(url, headers=self.headers, timeout=5)
        return list(set(self.pattern.findall(response.text)))

    def on_start(self, max_depth=2):
        '''
        Breadth-first crawl from self.url, expanding up to max_depth layers
        of links, and store the adjacency map locally in 'urls.json'.
        :param max_depth: number of BFS layers to expand (default 2,
            matching the original behaviour)
        :return: None
        '''
        adjacency = {}          # url -> list of outbound urls
        frontier = [self.url]
        done_count = 1
        for _ in range(max_depth):
            next_frontier = set()
            for url in frontier:
                if url in adjacency:
                    continue  # already crawled; avoid refetching
                try:
                    outbound = self.get_inside_urls(url)
                except requests.RequestException:
                    # Some hosts are unreachable/slow; skip them, keep crawling.
                    print('connect wrong')
                    continue
                # Record the page even when it has no outbound links, so it is
                # treated as a dangling node when the matrix is built (the
                # original only assigned dp[url] inside the for-loop, dropping
                # link-less pages entirely).
                adjacency[url] = outbound
                next_frontier.update(outbound)
                print(f'done {done_count}')
                done_count += 1
            frontier = list(next_frontier)
        with open('urls.json', 'w') as fp:
            json.dump(adjacency, fp)

    def pr_caculate(self):
        '''
        Read the crawled adjacency map and build the column-stochastic
        transition matrix S: column j spreads url j's rank equally over the
        pages it links to; dangling pages (no recorded outlinks) spread rank
        uniformly over all pages.
        Writes the url ordering to 'index_urls.json' and S to 'dp.json'.
        :return: None
        '''
        with open('urls.json', 'r') as fp:
            adjacency = json.load(fp)
        all_urls = set(adjacency)
        for outbound in adjacency.values():
            all_urls.update(outbound)
        num_urls = len(all_urls)
        print(f'there are {num_urls} urls')
        col_urls = list(all_urls)
        with open('index_urls.json', 'w') as fp:
            json.dump({'index_urls': col_urls}, fp)
        # Dict lookup is O(1); the original used col_urls.index(ul), an
        # O(n) scan per link, i.e. O(n^2) overall.
        index_of = {url: i for i, url in enumerate(col_urls)}
        matrix = [[0] * num_urls for _ in range(num_urls)]
        for col, url in enumerate(col_urls):
            outbound = adjacency.get(url)
            if outbound:
                weight = 1 / len(outbound)
                for target in outbound:
                    matrix[index_of[target]][col] = weight
            else:
                # Dangling node: distribute its rank uniformly.
                for row in range(num_urls):
                    matrix[row][col] = 1 / num_urls
        with open('dp.json', 'w') as fp:
            json.dump({'data': matrix}, fp)

    def pr_caculate_inner(self, d=0.85):
        '''
        Build the Google matrix A = d*S + (1-d)/N * E and run power
        iteration PR <- A @ PR until the mean squared change per component
        falls below 1e-3.  Stores the PR column vector in 'pr.json'.
        :param d: damping factor (default 0.85)
        :return: None
        '''
        with open('dp.json', 'r') as fp:
            data = json.load(fp)
        S = np.array(data['data'], dtype=float)
        N = S.shape[0]
        print(S.shape)
        A = d * S + (1 - d) / N * np.ones_like(S)
        epsilon = 1e-3
        pr = np.ones((N, 1))  # start from the all-ones vector, as before
        mse = epsilon + 1
        while mse > epsilon:
            nxt = A @ pr
            # float(...) collapses the 1-element array the original kept.
            mse = float(np.mean((pr - nxt) ** 2))
            pr = nxt
        with open('pr.json', 'w') as fp:
            json.dump({'pr': pr.tolist()}, fp)

    def top_10pr_url(self):
        '''
        Print the 10 highest-PR entries as [value, index] pairs, then the
        corresponding URLs looked up via 'index_urls.json'.
        :return: None
        '''
        with open('pr.json', 'r') as fp:
            pr_values = json.load(fp)['pr']
        # Each stored entry is a 1-element list [value]; pair it with its index.
        ranked = [[entry[0], idx] for idx, entry in enumerate(pr_values)]
        # sorted() replaces the original hand-rolled 10-pass bubble sort and
        # also behaves correctly when there are fewer than 10 pages.
        top10 = sorted(ranked, key=lambda item: item[0], reverse=True)[:10]
        print(top10)
        with open('index_urls.json', 'r') as fp:
            index_urls = json.load(fp)
        urls = index_urls['index_urls']
        for entry in top10:
            print(urls[entry[1]])
def _main():
    """Run the full crawl -> matrix -> PageRank -> report pipeline."""
    ranker = PageRank()
    ranker.on_start()           # crawl link structure and store it
    # ranker.url_connection_plot()
    ranker.pr_caculate()        # build and store the transition matrix S
    ranker.pr_caculate_inner()  # build A, iterate PR to convergence, store it
    ranker.top_10pr_url()       # print the 10 top-ranked sites and PR values


if __name__ == '__main__':
    _main()

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值