Identifying the technologies used by a website
The builtwith library; its Python 3 support is imperfect, and it sometimes fails to return any information.
Use builtwith.parse(url) to get the result.
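A minimal usage sketch (assuming the builtwith package is installed; the URL is just the example site used later in these notes):
import builtwith

# parse() returns a dict mapping technology categories to the technologies detected;
# the dict may be empty when nothing can be identified
tech = builtwith.parse('http://example.webscraping.com')
print(tech)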
Finding the owner of a website
The whois library; its full package name is python-whois.
Use whois.whois(domain).
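A minimal lookup sketch (assuming the python-whois package is installed; the domain below is only an example):
import whois

# whois.whois() returns an object whose fields include the registrar, name servers, etc.
w = whois.whois('webscraping.com')
print(w)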
Comparing urllib.request.urlopen() and requests.get()
The two are close in functionality: both are used to fetch the content of a web page.
One thing to note: the read() of an urlopen() response can only be consumed once, while the text of a requests response can be reused, so for simply fetching page content the requests library is the better fit.
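A small sketch of the difference (the URL is only a placeholder): the body returned by urlopen() is a stream, so read() drains it, whereas a requests response keeps the decoded text in .text:
from urllib.request import urlopen
import requests

url = 'http://example.webscraping.com'

resp1 = urlopen(url)
first = resp1.read()   # the page bytes
second = resp1.read()  # the stream is exhausted, so this is b''

resp2 = requests.get(url)
a = resp2.text  # decoded page text
b = resp2.text  # .text can be read any number of times with the same result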
In general, a 5XX error is worth retrying, because the error comes from a problem on the server side;
a 4XX error should not be retried, because it means the request itself is at fault.
download_save.py
# -*- coding: utf-8 -*-
# Download and save HTML pages
import requests
import traceback


def Download(url, count=3, user_agent='wswp'):  # download a page
    html = None
    resp = None
    try:
        hd = {'user-agent': user_agent}
        resp = requests.get(url, headers=hd, timeout=30)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        html = resp.text
        print('download {} success'.format(url))
    except requests.RequestException:
        if resp is not None and 500 <= resp.status_code < 600:  # retry on 5XX server errors
            if count > 0:  # stop retrying once the count reaches 0
                return Download(url, count - 1, user_agent)
            print('download {} error 5XX'.format(url))
        else:
            print('download {} error beyond 5XX'.format(url))  # other errors (4XX, network, ...)
    return html
def SaveHtml(url, html):  # save a page to disk
    root = 'crawler/example/' + url.split('/')[-1] + '.html'  # save path + file name
    try:
        with open(root, 'wt', encoding='utf-8') as fp:
            fp.write(html)
        print('savehtml {} success'.format(url))
    except Exception:
        print('savehtml {} error'.format(url))
        traceback.print_exc()


if __name__ == '__main__':
    url = 'http://example.webscraping.com'
    try:
        html = Download(url)
        SaveHtml(url, html)
        print('download and save {} success'.format(url))
    except Exception:
        print('download or save {} error'.format(url))
Sitemap crawler
A sitemap crawler uses the sitemap page advertised in the site's robots.txt.
The sitemap lists links to all of the site's pages, which helps a crawler locate the latest content.
sitemap.py
import download_save as c
import re


# Sitemap crawler: download every link found in the <loc> tags of a sitemap
def crawl_sitemap(url):
    sitemap = c.Download(url)  # download the sitemap itself
    if sitemap is None:
        return
    links = re.findall(r'<loc>(.*?)</loc>', sitemap)
    for link in links:
        print('start {}'.format(link))  # start downloading this link
        try:
            st = c.Download(link)
            c.SaveHtml(link, st)  # argument order is (url, html)
            print('{} crawl succeed'.format(link))  # success
        except Exception:
            print('error in {}'.format(link))  # failure


if __name__ == '__main__':
    crawl_sitemap('http://example.webscraping.com/sitemap.xml')
ID traversal crawler
Exploit a weakness in the site's URL structure: iterate over the numeric IDs to visit every page.
ID_scrapy.py
import download_save as c
import itertools

# ID traversal crawler: follow the regular pattern of numeric page IDs
max_errors = 5
num_errors = 0
for page in itertools.count(1):  # like range(1, +inf)
    url = 'http://example.webscraping.com/view/{}'.format(page)
    html = c.Download(url)
    if html is None:
        num_errors += 1
        if num_errors == max_errors:  # allow at most max_errors consecutive failed lookups
            break
    else:
        num_errors = 0  # reset the counter after a successful download
        c.SaveHtml(url, html)
        print('{} crawl succeed'.format(url))
Link crawler
Similar to breadth-first search (BFS): starting from the links on the home page, it expands level by level and collects all links of a certain relevance (it can check whether a link is disallowed by the robots protocol, and it can support a site proxy, download throttling, and a limit on crawl depth).
link_scrapy.py
# -*- coding: utf-8 -*-
# Link crawler
import download_save as ds
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin
import re
import Throttle


def link_crawler(url_origin, regex, max_deep=3):  # crawl links that match regex
    que = [url_origin]
    seen = {url_origin: 1}  # map every crawled URL to its depth
    rp = RobotFileParser()  # parser for the robots protocol
    rp.set_url('http://example.webscraping.com/robots.txt')
    rp.read()
    throttle = Throttle.Throttle(0.1)  # throttle downloads to 0.1 s per domain
    while que:
        url = que.pop()
        if rp.can_fetch('wswp', url):  # check whether robots.txt allows this URL
            throttle.wait(url)  # delay if the domain was accessed too recently
            html = ds.Download(url)
            if html is None:
                continue
            ds.SaveHtml(url, html)
            deep = seen[url]  # depth of the current page
            if deep != max_deep:  # stop expanding once the depth limit is reached
                for link in get_links(html):  # every link on this page
                    if re.match(regex, link):  # keep only the relevant links
                        link = urljoin(url_origin, link)  # convert to an absolute URL
                        if link not in seen:  # skip URLs that were already crawled
                            que.append(link)
                            seen[link] = deep + 1
        else:
            print('{} blocked by robots'.format(url))


def get_links(html):  # find every link on a page
    xx = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)  # regex for <a> href values, case-insensitive
    return xx.findall(html)


if __name__ == '__main__':
    url_origin = 'http://example.webscraping.com'
    link_crawler(url_origin, '/(index|view)')
Throttle.py (a class that decides whether a download needs to be delayed)
from urllib import parse
import datetime
import time


class Throttle:
    def __init__(self, delay):
        self.delay = delay  # minimum delay between downloads of the same domain
        self.domains = {}   # domain -> timestamp of the last access

    def wait(self, url):
        domain = parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # sleep only for the part of the delay that has not elapsed yet
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).total_seconds()
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()