1.1 Downloading web pages

All of the functions below require the urllib2 module:

import urllib2
1.1.1 Downloading a page - fetching a URL with the urllib2 module

# Download a web page, version 0.1
def download1(url):
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
    return html
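As a quick sanity check, a reachable URL returns the page's HTML and a failing one returns None. The URLs below are only illustrative; any reachable address and any invalid one will do:

# hypothetical usage - substitute any URLs you like
print download1('http://example.webscraping.com')   # page HTML, or None on error
print download1('http://invalid.example')           # prints the error, returns None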
1.1.2 Downloading a page - adding retries

Download errors are often temporary, for example the 503 Service Unavailable error a server returns when it is overloaded. For this class of errors we can simply retry the download.

# Download a web page, version 0.2
# num_retries: number of retries remaining
def download2(url, num_retries=2):
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            # 4xx errors mean the request itself was faulty, so retrying is pointless
            # 5xx errors mean something went wrong on the server, so retry those
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry recursively
                return download2(url, num_retries-1)
    return html
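To watch the retry logic fire we need a URL that reliably answers with a 5xx status. One option (an assumption: the public httpstat.us testing service must be online) is its /500 endpoint, which always returns 500 Internal Server Error:

# should print 'Download error: ...' three times - the initial
# attempt plus the two retries - then give up and return None
html = download2('http://httpstat.us/500')
assert html is None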
1.1.3 Downloading a page - setting the user agent

By default, urllib2 downloads content with Python-urllib/2.7 as its user agent. It is better to send an identifiable user agent of your own: some websites block the default one, and poorly written Python crawlers have a reputation for overloading servers.

# Download a web page, version 0.3
# num_retries: number of retries remaining
# user_agent: user agent string to send with the request
def download3(url, user_agent='daimx', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            # 4xx errors mean the request itself was faulty
            # 5xx errors mean something went wrong on the server
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry recursively
                return download3(url, user_agent, num_retries-1)
    return html
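To confirm the header is actually being sent, one option (assuming the public httpbin.org echo service is reachable) is to fetch a URL that reflects the request's User-Agent back to us:

# httpbin.org/user-agent answers with the request's User-Agent as JSON,
# so the printed body should contain the string 'daimx'
print download3('http://httpbin.org/user-agent')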
1.2 Different kinds of crawlers

1.2.1 Sitemap crawler

Use a simple regular expression to extract the URLs from the <loc> tags of the sitemap file.

import re

def crawl_sitemap(url):
    # download the sitemap file; url points at the sitemap itself
    sitemap = download3(url)
    print sitemap
    # read the set of page URLs out of the sitemap
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each page listed in the sitemap
    for link in links:
        html = download3(link)

crawl_sitemap('https://www.shiyanlou.com/sitemap.xml')
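To make the regular expression concrete, here is what re.findall extracts from a minimal, made-up sitemap fragment:

# fabricated two-entry sitemap, just to show what findall returns
sample = ('<urlset><url><loc>http://a.example/1</loc></url>'
          '<url><loc>http://a.example/2</loc></url></urlset>')
print re.findall('<loc>(.*?)</loc>', sample)
# ['http://a.example/1', 'http://a.example/2']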
1.2.2 ID iteration crawler

While browsing shiyanlou.com you can see that the URLs follow a pattern, for example:

https://www.shiyanlou.com/courses/1
https://www.shiyanlou.com/courses/2
https://www.shiyanlou.com/courses/3
...

So we can try crawling every course page by iterating over the IDs.

import itertools

def iteration2(max_errors=5):
    num_errors = 0
    for page in itertools.count(1):
        url = 'https://www.shiyanlou.com/courses/596/labs/%d/document' % page
        html = download3(url)
        if html is None:
            # count consecutive failures; after max_errors bad links
            # in a row, assume we have run past the last ID and stop
            num_errors += 1
            if num_errors == max_errors:
                break
        else:
            # a successful download resets the consecutive-error count
            num_errors = 0
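itertools.count(1) yields 1, 2, 3, ... without end, which is why the loop needs its own stopping condition. A minimal illustration:

import itertools

# take the first five values from the otherwise infinite counter
counter = itertools.count(1)
print [next(counter) for _ in range(5)]   # [1, 2, 3, 4, 5]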
1.2.3 Link crawler

Initial version:

import re

def link_crawler1(seed_url, link_regex):
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download3(url)
        for link in get_links(html):
            print 'link:', link, 'regex:', link_regex, 'matched:', re.search(link_regex, link)
            # re.match only matches at the start of the string: if the beginning
            # does not fit the pattern, it fails and returns None. re.search scans
            # the whole string until it finds a match, which is what we want here.
            if re.search(link_regex, link):
                crawl_queue.append(link)

def get_links(html):
    # pull the value of every href attribute out of the page's <a> tags
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

if __name__ == "__main__":
    link_crawler1('http://example.webscraping.com', '/(index|view)')
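The re.match versus re.search distinction matters here because the extracted links are paths such as /places/default/index/1, where the interesting part sits in the middle of the string:

import re

link = '/places/default/index/1'
print re.match('/(index|view)', link)    # None - the string starts with '/places'
print re.search('/(index|view)', link)   # match object - '/index' is found mid-string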
The links obtained above are relative, so running this version fails:
Traceback (most recent call last):
File "/home/sdv/ProjectTools/pycharm-2017.2.3/helpers/pydev/pydevd.py", line 1599, in <module>
globals = debugger.run(setup['file'], None, None, is_module)
File "/home/sdv/ProjectTools/pycharm-2017.2.3/helpers/pydev/pydevd.py", line 1026, in run
pydev_imports.execfile(file, globals, locals) # execute the script
File "/home/sdv/PycharmProjects/scraping/chapter1-4/__init__.py", line 121, in <module>
crawl_queue = [seed_url]
File "/home/sdv/PycharmProjects/scraping/chapter1-4/__init__.py", line 106, in link_crawler1
url = crawl_queue.pop()
File "/home/sdv/PycharmProjects/scraping/chapter1-4/__init__.py", line 41, in download3
html = urllib2.urlopen(request).read()
File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 421, in open
protocol = req.get_type()
File "/usr/lib/python2.7/urllib2.py", line 283, in get_type
raise ValueError, "unknown url type: %s" % self.__original
ValueError: unknown url type: /places/default/index/1
We need to convert the relative links into absolute ones, using the urlparse module.
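urlparse.urljoin resolves a relative path against a base URL. Applied to the link that made urlopen fail in the traceback above:

import urlparse

print urlparse.urljoin('http://example.webscraping.com', '/places/default/index/1')
# http://example.webscraping.com/places/default/index/1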
import urlparse

def link_crawler2(seed_url, link_regex):
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download3(url)
        for link in get_links(html):
            print 'link:', link, 'regex:', link_regex, 'matched:', re.search(link_regex, link)
            if re.search(link_regex, link):
                # join the link to the seed URL to get an absolute address
                link = urlparse.urljoin(seed_url, link)
                crawl_queue.append(link)
Pages link back and forth to each other, so this crawler will loop between them forever. To avoid re-crawling the same links, we need to record which ones have already been seen:
def link_crawler3(seed_url, link_regex):
    crawl_queue = [seed_url]
    # track which URLs have been seen before; a set avoids duplicates
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download3(url)
        for link in get_links(html):
            print 'link:', link, 'regex:', link_regex, 'matched:', re.search(link_regex, link)
            if re.search(link_regex, link):
                # join the link to the seed URL to get an absolute address
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    # record the link so it is queued at most once
                    seen.add(link)
                    crawl_queue.append(link)
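Invoke the deduplicating version just like the first one (example.webscraping.com is the sample site used throughout; whether it is still online is outside our control):

if __name__ == "__main__":
    # crawl index and view pages, visiting each URL at most once
    link_crawler3('http://example.webscraping.com', '/(index|view)')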