The book Web Scraping with Python (《用Python写网络爬虫》) is good in itself, except that it is based on Python 2; here the code is rewritten for Python 3. The networking libraries differ quite a bit between Python 2 and 3.
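The most visible change for this code: Python 2's urllib2 was split into urllib.request and urllib.error in Python 3 (and urlparse became urllib.parse). A minimal comparison, with an illustrative URL:

# Python 2:
#   import urllib2
#   html = urllib2.urlopen('http://example.webscraping.com').read()

# Python 3:
import urllib.request
import urllib.error

try:
    html = urllib.request.urlopen('http://example.webscraping.com').read()
except urllib.error.URLError as e:
    print('Download error:', e.reason)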
What the code does: it downloads a site's sitemap. The crawl is started from the main block of the crawler file with crawl_sitemap('http://example.webscraping.com/sitemap.xml').
common2 provides the download functions and the error handling. There is a retry mechanism: a failed download is retried at most 2 more times, and only when the HTTP error code is in the 5XX range (500-599). If the site is already refusing service, for example with 429 Too Many Requests, no retry is attempted.
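The core of that rule, pulled out here as a hypothetical helper (should_retry is not part of the book's code, just an illustration): urllib raises an HTTPError that carries a numeric code attribute, and only codes in the 5XX range trigger a retry.

def should_retry(error):
    # hypothetical helper: retry only server-side 5XX errors;
    # 429 Too Many Requests is a 4XX client error and is not retried
    return hasattr(error, 'code') and 500 <= error.code < 600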
Code file 1: sitemap_crawler_py3.py
# -*- coding: utf-8 -*-
import re

from common2 import download


def crawl_sitemap(url):
    # download the sitemap file;
    # download() already returns a decoded str, or None on failure
    sitemap = download(url)
    if sitemap is None:
        return
    print(sitemap)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    print("This sitemap contains %d links:" % len(links))
    print(links)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...


if __name__ == '__main__':
    #crawl_sitemap('http://172.27.19.23:8000/places/default/index/sitemap.xml')
    crawl_sitemap('http://example.webscraping.com/sitemap.xml')

'''Other sitemaps to try:
hao123.com: http://www.hao123.com/sitemap.xml
sohu: http://sitemap.sohu.com
http://sitemap.cn.yahoo.com
'''
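For reference, the <loc> regex in crawl_sitemap assumes the standard sitemap XML layout; a quick self-contained check with an inline sample (the two URLs are placeholders):

import re

sample = '''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>http://example.webscraping.com/places/default/view/Afghanistan-1</loc></url>
  <url><loc>http://example.webscraping.com/places/default/view/Albania-2</loc></url>
</urlset>'''

print(re.findall('<loc>(.*?)</loc>', sample))
# ['http://example.webscraping.com/places/default/view/Afghanistan-1',
#  'http://example.webscraping.com/places/default/view/Albania-2']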
Code file 2: common2.py
# -*- coding: utf-8 -*-
import urllib.error
import urllib.request
from urllib.parse import urlparse


def download1(url):
    """Simple downloader"""
    return urllib.request.urlopen(url).read()

def download2(url):
    """Download function that catches errors"""
    print('Downloading:', url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
    return html

def download3(url, num_retries=2):
    """Download function that also retries 5XX errors"""
    print('Downloading:', url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download3(url, num_retries - 1)
    return html

def download4(url, user_agent='wswp', num_retries=2):
    """Download function that includes user agent support"""
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download4(url, user_agent, num_retries - 1)
    return html

def download(url, user_agent='wswp', proxy=None, num_retries=2):
    """Download function with support for proxies"""
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode('utf-8', 'ignore')
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html

if __name__ == '__main__':
    #print(download('http://172.27.19.23:8000/places/default/index'))
    #print(download('http://exmaple.webscraping.com/places/default'))
    print(download('https://pinyin.sogou.com/'))
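Typical calls to the final download(), with the optional user agent and proxy arguments (the proxy address is just a placeholder, assuming a local HTTP proxy):

from common2 import download

# plain download with the default 'wswp' user agent
html = download('http://example.webscraping.com')

# custom user agent plus a (hypothetical) local HTTP proxy
html = download('http://example.webscraping.com',
                user_agent='my-crawler',
                proxy='http://127.0.0.1:8080')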