The book Web Scraping with Python (《用Python写网络爬虫》) is good in itself, except that it is based on Python 2; here the code is rewritten for Python 3. The networking libraries differ quite a bit between Python 2 and 3.
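The most visible change for this code: Python 2's urllib2 was split into urllib.request and urllib.error in Python 3 (and urlparse became urllib.parse). A minimal comparison, with an illustrative URL:

# Python 2:
#   import urllib2
#   html = urllib2.urlopen('http://example.webscraping.com').read()

# Python 3:
import urllib.request
import urllib.error

try:
    html = urllib.request.urlopen('http://example.webscraping.com').read()
except urllib.error.URLError as e:
    print('Download error:', e.reason)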
What the code does: it downloads a site's sitemap. The crawl is started from the main block of the crawler file with crawl_sitemap('http://example.webscraping.com/sitemap.xml').
common2 provides the download functions and the error handling. There is a retry mechanism: a failed download is retried at most 2 more times, and only when the HTTP error code is in the 5XX range (500-599). If the site is already refusing service, for example with 429 Too Many Requests, no retry is attempted.
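The core of that rule, pulled out here as a hypothetical helper (should_retry is not part of the book's code, just an illustration): urllib raises an HTTPError that carries a numeric code attribute, and only codes in the 5XX range trigger a retry.

def should_retry(error):
    # hypothetical helper: retry only server-side 5XX errors;
    # 429 Too Many Requests is a 4XX client error and is not retried
    return hasattr(error, 'code') and 500 <= error.code < 600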
Code file 1: sitemap_crawler_py3.py
# -*- coding: utf-8 -*-
import re

from common2 import download


def crawl_sitemap(url):
    # download the sitemap file;
    # download() already returns a decoded str, or None on failure
    sitemap = download(url)
    if sitemap is None:
        return
    print(sitemap)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    print("This sitemap contains %d links:" % len(links))
    print(links)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...


if __name__ == '__main__':
    #crawl_sitemap('http://172.27.19.23:8000/places/default/index/sitemap.xml')
    crawl_sitemap('http://example.webscraping.com/sitemap.xml')

'''Other sitemaps to try:
hao123.com: http://www.hao123.com/sitemap.xml
sohu: http://sitemap.sohu.com
http://sitemap.cn.yahoo.com
'''
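For reference, the <loc> regex in crawl_sitemap assumes the standard sitemap XML layout; a quick self-contained check with an inline sample (the two URLs are placeholders):

import re

sample = '''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>http://example.webscraping.com/places/default/view/Afghanistan-1</loc></url>
  <url><loc>http://example.webscraping.com/places/default/view/Albania-2</loc></url>
</urlset>'''

print(re.findall('<loc>(.*?)</loc>', sample))
# ['http://example.webscraping.com/places/default/view/Afghanistan-1',
#  'http://example.webscraping.com/places/default/view/Albania-2']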
Code file 2: common2.py
# -*- coding: utf-8 -*-
import urllib.error
import urllib.request
from urllib.parse import urlparse


def download1(url):
    """Simple downloader"""
    return urllib.request.urlopen(url).read()

def download2(url):
    """Download function that catches errors"""
    print('Downloading:', url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
    return html

def download3(url, num_retries=2):
    """Download function that also retries 5XX errors"""
    print('Downloading:', url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download3(url, num_retries - 1)
    return html

def download4(url, user_agent='wswp', num_retries=2):
    """Download function that includes user agent support"""
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download4(url, user_agent, num_retries - 1)
    return html

def download(url, user_agent='wswp', proxy=None, num_retries=2):
    """Download function with support for proxies"""
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode('utf-8', 'ignore')
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html

if __name__ == '__main__':
    #print(download('http://172.27.19.23:8000/places/default/index'))
    #print(download('http://exmaple.webscraping.com/places/default'))
    print(download('https://pinyin.sogou.com/'))
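Typical calls to the final download(), with the optional user agent and proxy arguments (the proxy address is just a placeholder, assuming a local HTTP proxy):

from common2 import download

# plain download with the default 'wswp' user agent
html = download('http://example.webscraping.com')

# custom user agent plus a (hypothetical) local HTTP proxy
html = download('http://example.webscraping.com',
                user_agent='my-crawler',
                proxy='http://127.0.0.1:8080')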