Python写简单的爬虫——爬虫陷阱（解决方法一：记录深度）

最新推荐文章于 2023-09-27 14:45:26 发布

hide_in_darkness

最新推荐文章于 2023-09-27 14:45:26 发布

阅读量363

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/hide_in_darkness/article/details/104517942

版权

python 专栏收录该内容

73 篇文章 0 订阅

订阅专栏

import urllib.request as ur
from urllib.error import URLError,ContentTooShortError,HTTPError
import re
from urllib.parse import urljoin
from urllib import robotparser
import time
from urllib.parse import urlparse
import itertools

class Throttle(object):
    """Enforce a minimum pause between consecutive requests to one domain."""

    def __init__(self, delay):
        # Minimum number of seconds between two requests to the same domain.
        self.delay = delay
        # Maps a domain (URL netloc) to the timestamp of its latest request.
        self.domains = {}

    def wait(self, url):
        """Sleep if the domain of *url* was requested less than `delay` seconds ago.

        The domain's timestamp is refreshed on every call, whether or not
        a sleep was needed.
        """
        # urlparse() splits a URL into scheme, netloc, path, params, query
        # and fragment; only the netloc (server location) is used as the key.
        host = urlparse(url).netloc
        previous = self.domains.get(host)
        if self.delay > 0 and previous is not None:
            elapsed = time.time() - previous
            remaining = self.delay - elapsed
            # Only pause when the configured delay has not fully elapsed yet.
            if remaining > 0:
                time.sleep(remaining)
        self.domains[host] = time.time()

def download(url, num_retries=2, user_agent='wswp',charset='utf-8',proxy=None):
    """Download the page at *url* and return its decoded text.

    Args:
        url: Address to fetch.
        num_retries: How many more attempts are allowed after a 5XX error.
        user_agent: Value of the User-Agent request header; the default
            'wswp' stands for "Web Scraping With Python".
        charset: Fallback encoding used when the response declares none.
        proxy: When truthy, the request is sent through the opener
            returned by getProxy() instead of the default opener.

    Returns:
        The decoded response body, or None when the download failed.
    """
    print('Downloading:',url)
    request=ur.Request(url)
    # Add request headers (typically 'Cookies' and 'User-Agent').
    request.add_header('User-Agent',user_agent)
    try:
        # getProxy() returns an opener; the request has to go through that
        # opener, otherwise the proxy setting has no effect at all.
        open_url=getProxy().open if proxy else ur.urlopen
        # The with-block closes the response even if read()/decode() fails.
        with open_url(request) as resp:
            # Use the charset announced by the response headers, falling
            # back to the caller-supplied default when none is declared.
            cs=resp.headers.get_content_charset() or charset
            html=resp.read().decode(cs)
    except (URLError,ContentTooShortError,HTTPError) as e:
        print('Download error:',e.reason)
        html=None
        # 4XX errors are caused by the request, 5XX errors by the server,
        # so only 5XX errors are worth retrying. Only HTTPError has `code`.
        if num_retries>0 and hasattr(e,'code') and 500<=e.code<600:
            # Forward every option so that the retry behaves like the
            # original attempt (same user agent, charset and proxy).
            return download(url,num_retries-1,user_agent=user_agent,
                            charset=charset,proxy=proxy)
    return html


#获取代理IP
def getProxy():
    """Build a urllib opener that sends HTTP requests through a proxy.

    The proxy address is read from the body of the proxy-provider URL
    below, which is a placeholder that must be replaced with a real one.

    Returns:
        An OpenerDirector; call its open() method to fetch through the proxy.
    """
    with ur.urlopen("{代理IP网址}") as resp:
        # Strip surrounding whitespace/newlines so the address is usable.
        proxy_address = resp.read().decode('utf-8').strip()
    # ProxyHandler expects a mapping of scheme -> proxy URL; a set literal
    # ({'http', address}) is rejected because it is not a mapping.
    proxy_handler = ur.ProxyHandler({'http': proxy_address})
    proxy_openner = ur.build_opener(proxy_handler)
    return proxy_openner
#注意ur.urlopen和ur.install_opener的区别
#install_opener是全局定义 以后的每一次ur.urlopen会默认调度install_opener

#解析robots.txt文件，以避免下载禁止爬取的URL
def get_robots_parser(robot_url):
    """Return a RobotFileParser loaded with the robots.txt at *robot_url*."""
    parser = robotparser.RobotFileParser()
    # set_url() only records the location; read() fetches and parses it.
    parser.set_url(robot_url)
    parser.read()
    return parser


#防止爬虫陷阱 记录当前到达网页经过多少个链接，即记录深度
def link_crawlinks(start_url,link_regex,robots_url=None,user_agent='wswp',max_depth=5):
    """Crawl from *start_url*, following links that match *link_regex*.

    To avoid spider traps, the number of links followed to reach each page
    (its depth) is recorded, and pages at *max_depth* are not downloaded.

    Args:
        start_url: First URL to download.
        link_regex: Pattern a link's href must match (via re.match) to be
            followed.
        robots_url: Location of robots.txt; defaults to /robots.txt on the
            host of *start_url*.
        user_agent: User agent used for the robots.txt check and downloads.
        max_depth: Depth at which a queued page is skipped.
    """
    crawl_queue=[start_url]
    if not robots_url:
        # robots.txt lives at the root of the host, so resolve it against
        # the start URL instead of appending it to whatever path it has.
        robots_url=urljoin(start_url,'/robots.txt')
    rp=get_robots_parser(robots_url)
    # Maps every discovered URL to its depth; the start page has depth 0 and
    # is recorded up front so that a link back to it is not queued again.
    seen={start_url:0}
    # One throttle for the whole crawl: it remembers the last request time
    # per domain, which is lost if it is re-created on every iteration.
    throttle=Throttle(delay=0.5)
    while crawl_queue:
        # Take the most recently queued URL.
        url=crawl_queue.pop()
        # Respect robots.txt before doing anything else with this URL.
        if not rp.can_fetch(user_agent,url):
            print('Blocked by robots.txt:',url)
            continue
        depth=seen.get(url,0)
        if depth==max_depth:
            print('Skipping %s due to depth'%url)
            continue
        # Apply the crawl delay for this URL's domain.
        throttle.wait(url)
        html=download(url,user_agent=user_agent)
        if not html:
            continue
        for link in get_links(html):
            # Only follow links whose href matches link_regex.
            if re.match(link_regex,link):
                # Relative links are relative to the page they appear on.
                abs_link=urljoin(url,link)
                if abs_link not in seen:
                    # Record the depth and queue each URL only once.
                    seen[abs_link]=depth+1
                    crawl_queue.append(abs_link)

def get_links(html):
    """Return the href value of every <a> tag found in *html*, in order."""
    # Case-insensitive match on anchor tags, capturing the quoted href value.
    anchor_pattern = """<a[^>]+href=["'](.*?)['"]"""
    return re.findall(anchor_pattern, html, re.IGNORECASE)