# 爬虫-链接深度 (crawler - link depth)

from urllib.request import urlopen
from urllib.error import URLError,HTTPError
import re
import time

#url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E8%A5%BF%E5%AE%89&kw=python&sm=0&p=1'
#url = 'http://httpstat.us/500'
def download(url, retries_num=3):
    """Fetch *url* and return its body decoded as UTF-8, or None on failure.

    Server-side (5xx) HTTP errors are retried recursively up to
    ``retries_num`` more times; client errors (4xx) and unreachable
    URLs fail immediately.

    Args:
        url: absolute URL to fetch.
        retries_num: remaining retry attempts for 5xx responses.

    Returns:
        The decoded page text, or None when the request ultimately fails.
    """
    html = None
    try:
        print('download... %s' % url)
        res = urlopen(url)
        html = res.read().decode('utf-8')
    except HTTPError as e:
        print(e.code)
        # Only 5xx responses are worth retrying; 5xx is 500-599 (600 excluded).
        if retries_num > 0 and 500 <= e.code < 600:
            # Report retries *remaining* instead of the original hardcoded
            # "4 - retries_num", which was only correct for the default of 3.
            print('[E]HTTPError!,retries left %d' % retries_num)
            html = download(url, retries_num - 1)
        else:
            print('[E]Failed!')
    except URLError:
        print('[E]Unlocated URL!', url)

    return html

def get_links(html):
    """Return every href target of an ``<a ...>`` tag in *html*, in order."""
    anchor_pattern = r'<a[^>]+href=["\'](.*?)["\']'
    return [match.group(1)
            for match in re.finditer(anchor_pattern, html, re.IGNORECASE)]

def link_crawler(home_url, link_regex, depth_regex=None):
    """Breadth-first crawl starting at *home_url*.

    Links matching ``link_regex`` are followed; pages whose own URL
    matches ``depth_regex`` are downloaded but not expanded further
    (they act as leaves, limiting crawl depth).

    Args:
        home_url: URL the crawl starts from.
        link_regex: pattern a discovered link must match to be queued.
        depth_regex: optional pattern marking leaf pages.

    Returns:
        The set of links that were discovered and queued.
    """
    # Compile once up front instead of on every loop iteration.
    link_pat = re.compile(link_regex, re.IGNORECASE)
    depth_pat = re.compile(depth_regex, re.IGNORECASE) if depth_regex else None
    crawl_queue = [home_url]
    seen = set()
    while crawl_queue:
        url = crawl_queue.pop(0)
        time.sleep(0.5)  # politeness delay between requests
        html = download(url)
        # Leaf pages are fetched but their links are not followed.
        if depth_pat and depth_pat.match(url):
            continue
        if html is None:
            # Download failed; original code crashed here on get_links(None).
            continue
        for link in get_links(html):
            if link_pat.match(link) and link not in seen:
                seen.add(link)
                crawl_queue.append(link)
    return seen

def main():
    """Crawl Zhaopin search results for python jobs in Xi'an (西安).

    Job-detail pages are treated as leaves (downloaded, not expanded);
    further search-result pages are expanded.

    Returns:
        The set of discovered links (previously computed and discarded).
    """
    home_url = ('http://sou.zhaopin.com/jobs/searchresult.ashx'
                '?jl=%E8%A5%BF%E5%AE%89&kw=python&p=1&isadv=0')
    # Raw strings: '\d' / '\?' in plain strings are unrecognized escapes
    # (SyntaxWarning in modern Python); the pattern values are unchanged.
    link_regex = (r'http://jobs.zhaopin.com/[\d]{15}.htm'
                  r'|http://sou.zhaopin.com/jobs/searchresult.ashx'
                  r'\?jl=%e8%a5%bf%e5%ae%89&amp;kw=python')
    depth_regex = r'http://jobs.zhaopin.com/[\d]{15}.htm'
    # Return the result instead of binding it to an unused local.
    return link_crawler(home_url=home_url, link_regex=link_regex,
                        depth_regex=depth_regex)


if __name__ == '__main__':
    # Run the crawler only when executed as a script, not on import.
    main()







  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值