Introduction to web crawlers
Checking robots.txt
A well-behaved crawler should consult a site's robots.txt to decide which parts of the site it is allowed to crawl.
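For reference, a typical robots.txt looks something like the following sketch (the rules are illustrative, chosen to match the BadCrawler/GoodCrawler behaviour of the example site used later in this article):

User-agent: BadCrawler
Disallow: /

User-agent: *
Crawl-delay: 5
Disallow: /trap

Sitemap: http://example.python-scraping.com/sitemap.xml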
Estimating the size of a website
You can get a rough estimate of a site's size with the site: operator on Baidu or Google; for example, searching site:example.python-scraping.com shows roughly how many pages the search engine has indexed.
Identifying the technology used by a website
The detectem module can do this. It depends on Docker and Python 3.5+; run the following commands (the Windows build of Docker works fine):
docker pull scrapinghub/splash
pip install detectem
That said, this module is not recommended for identifying a site's technology stack unless you really cannot get by with Chrome's F12 developer tools.
Finding the owner of a website
Install the corresponding module:
pip install python-whois
Then use the following code:
#!/usr/bin/env python
# encoding: utf-8
import whois
print(whois.whois('www.baidu.com'))
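The call above prints the full parsed WHOIS record. Individual fields can also be read from the result, which behaves like a dict; a minimal sketch (which fields are present varies by TLD, so treat the field names as an assumption to verify):

import whois

w = whois.whois('www.baidu.com')
# dict-style access to parsed fields; availability depends on the registry's WHOIS format
print(w.get('registrar'))
print(w.get('emails'))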
Writing your first crawler
Setting a user agent and retrying downloads
When the crawler hits a 5xx error it retries the download; other errors are not retried. The code below retries recursively, using the remaining number of attempts as the recursion's termination condition.
#!/usr/bin/env python
# encoding: utf-8
import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError


def download(url, num_retries=2, user_agent='wswp'):
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        html = urllib.request.urlopen(request).read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries - 1, user_agent)
    return html


if __name__ == "__main__":
    download("http://httpstat.us/500")  # retries the 500 error twice, then gives up
Sitemap crawler
This version simply downloads the sitemap and parses it with re. The nice touch is that the response headers are used to obtain the site's declared character encoding, which is then used to decode the page.
#!/usr/bin/env python
# encoding: utf-8
import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError
import re


def download(url, num_retries=2, user_agent='wswp', charset='utf-8'):
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        resp = urllib.request.urlopen(request)
        cs = resp.headers.get_content_charset()  # charset declared in the response headers
        if not cs:
            cs = charset
        html = resp.read().decode(cs)  # decode using the site's declared encoding
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries - 1, user_agent, charset)
    return html


def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    if sitemap is not None:
        links = re.findall('<loc>(.*?)</loc>', sitemap)
        # download each link
        for link in links:
            html = download(link)
            # scrape html here


if __name__ == "__main__":
    crawl_sitemap("http://example.python-scraping.com/sitemap.xml")
Link crawler
To make the crawler behave more like a normal user, it should follow links to the content we are interested in. Naively following every link, however, downloads many pages we do not need, so the links are filtered with a regular expression and only the matching ones are kept. Because many pages link to each other, we also need deduplication so the same URL is not crawled twice.
import re
from urllib.parse import urljoin

# reuses the download() function defined in the previous section


def link_crawler(start_url, link_regex):
    " Crawl from the given start URL following links matched by link_regex "
    crawl_queue = [start_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if not html:
            continue
        # filter for links matching our regular expression
        for link in get_links(html):
            # re.search is used rather than re.match so the pattern can occur anywhere in the link
            if re.search(link_regex, link):
                abs_link = urljoin(start_url, link)
                print(abs_link)
                if abs_link not in seen:
                    seen.add(abs_link)
                    crawl_queue.append(abs_link)


def get_links(html):
    " Return a list of links from html "
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == "__main__":
    link_crawler("http://example.python-scraping.com", '/(index|view)/')
Running this prints each matched link followed by its download message.
Advanced features
Parsing robots.txt
Parsing this file lets us avoid downloading URLs that are disallowed for crawling. The robotparser module bundled with urllib makes this straightforward.
from urllib import robotparser


def get_robots_parser(robots_url):
    " Return the robots parser object using the robots_url "
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


if __name__ == "__main__":
    rp = get_robots_parser("http://example.python-scraping.com/robots.txt")
    user_agent = 'BadCrawler'
    print(rp.can_fetch(user_agent, "http://example.python-scraping.com"))  # False
    user_agent = 'GoodCrawler'
    print(rp.can_fetch(user_agent, "http://example.python-scraping.com"))  # True
Proxy support
With requests you only need to pass the proxies argument (a dict) when making the request:
requests.get(url, headers=headers, proxies=proxies)
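A minimal sketch of what that looks like end to end (the proxy addresses below are placeholders, not real proxies):

import requests

proxies = {
    'http': 'http://10.10.1.10:3128',   # placeholder HTTP proxy
    'https': 'http://10.10.1.10:1080',  # placeholder HTTPS proxy
}
headers = {'User-Agent': 'wswp'}
resp = requests.get('http://example.python-scraping.com', headers=headers, proxies=proxies)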
Throttling downloads
The Throttle class uses a dict to record when each domain was last accessed; if the configured delay has not yet elapsed since that last access, it sleeps for the remaining time.
from urllib.parse import urlparse
import time


class Throttle:
    """ Add a delay between downloads to the same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                # domain has been accessed recently
                # so need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = time.time()
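A minimal usage sketch, assuming the download() function from earlier is available (the two-second delay and the URLs are arbitrary illustrative values):

throttle = Throttle(2)
for url in ['http://example.python-scraping.com/view/1',
            'http://example.python-scraping.com/view/2']:
    throttle.wait(url)   # sleeps if this domain was hit less than 2 seconds ago
    html = download(url)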
Avoiding crawler traps
A crawler trap is, in essence, something like paginated requests that keep serving empty search-result pages until some huge maximum page number is reached, so the crawler keeps requesting pages forever.
A simple way to avoid falling into such a trap is to record how many links were followed to reach the current page, i.e. its depth. Once the maximum depth is reached, links found on that page are no longer added to the queue:
if depth == max_depth:
    print('Skipping %s due to depth' % url)
    continue
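For context, here is a minimal, self-contained sketch of the depth-limiting logic, where the seen dict maps each URL to the depth at which it was discovered (the link graph below is a fabricated stand-in for real get_links() output):

max_depth = 2
fake_links = {'A': ['B'], 'B': ['C'], 'C': ['D'], 'D': []}  # pretend link graph instead of real downloads
crawl_queue = ['A']
seen = {}  # maps url -> depth at which it was found
while crawl_queue:
    url = crawl_queue.pop()
    depth = seen.get(url, 0)
    if depth == max_depth:
        print('Skipping %s due to depth' % url)  # 'C' is skipped here
        continue
    for link in fake_links[url]:
        if link not in seen:
            seen[link] = depth + 1   # links found on this page are one level deeper
            crawl_queue.append(link)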
The link crawler with all of the features above
#!/usr/bin/env python
# encoding: utf-8
# final version
import re
import time
import urllib.request
from urllib import robotparser
from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import urljoin, urlparse


class Throttle:
    """ Add a delay between downloads to the same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                # domain has been accessed recently
                # so need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = time.time()


def download(url, num_retries=2, user_agent='wswp', charset='utf-8', proxy=None):
    """ Download a given URL and return the page content
        args:
            url (str): URL
        kwargs:
            user_agent (str): user agent (default: wswp)
            charset (str): charset if website does not include one in headers
            proxy (str): proxy url, ex 'http://IP' (default: None)
            num_retries (int): number of retries if a 5xx error is seen (default: 2)
    """
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        if proxy:
            proxy_support = urllib.request.ProxyHandler({'http': proxy})
            opener = urllib.request.build_opener(proxy_support)
            urllib.request.install_opener(opener)
        resp = urllib.request.urlopen(request)
        cs = resp.headers.get_content_charset()
        if not cs:
            cs = charset
        html = resp.read().decode(cs)
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries - 1, user_agent, charset, proxy)
    return html


def get_robots_parser(robots_url):
    " Return the robots parser object using the robots_url "
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


def get_links(html):
    " Return a list of links (using simple regex matching) from the html content "
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxy=None, delay=3, max_depth=4):
    """ Crawl from the given start URL following links matched by link_regex. In the current
        implementation, we do not actually scrape any information.
        args:
            start_url (str): web site to start crawl
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt (default: /robots.txt resolved against start_url)
            user_agent (str): user agent (default: wswp)
            proxy (str): proxy url, ex 'http://IP' (default: None)
            delay (int): seconds to throttle between requests to one domain (default: 3)
            max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
    """
    crawl_queue = [start_url]
    # keep track of which URLs have been seen before, and at what depth
    seen = {}
    if not robots_url:
        # resolve /robots.txt against the site root rather than appending it to start_url
        robots_url = urljoin(start_url, '/robots.txt')
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxy=proxy)
            if not html:
                continue
            # TODO: add actual data scraping here
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.search(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    print(abs_link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)


if __name__ == "__main__":
    link_regex = '/(index|view)/'
    link_crawler('http://example.python-scraping.com/index', link_regex, max_depth=1)
Running this prints the matched links as they are downloaded, with requests throttled, disallowed URLs skipped, and pages beyond the depth limit ignored.
Adding a scrape callback to the link crawler
To reuse the code above for scraping other sites, we add a callback that handles the actual scraping, so only that function needs to change per site. The code below is a trimmed-down version of link_crawler that keeps only some of its features.
#!/usr/bin/env python
# encoding: utf-8
import re
import urllib.request
from lxml.html import fromstring
from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import urljoin


def scrape_callback(url, html):
    """ Scrape each row from the country or district data using XPath and lxml """
    fields = ('flag_img', 'area', 'population', 'iso', 'country_or_district', 'capital',
              'continent', 'tld', 'currency_code', 'currency_name',
              'phone', 'postal_code_format', 'postal_code_regex',
              'languages', 'neighbours')
    if re.search('/view/', url):
        tree = fromstring(html)
        try:
            all_rows = [
                tree.xpath('//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content()
                for field in fields]
            print(url, all_rows)
        except Exception as ee:
            print("ee:", ee)


def download(url, num_retries=2, user_agent='wswp', charset='utf-8'):
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        resp = urllib.request.urlopen(request)
        cs = resp.headers.get_content_charset()  # charset declared in the response headers
        if not cs:
            cs = charset
        html = resp.read().decode(cs)  # decode using the site's declared encoding
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries - 1, user_agent, charset)
    return html


def get_links(html):
    " Return a list of links from html "
    webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
    return webpage_regex.findall(html)


def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 max_depth=4, scrape_callback=None):
    crawl_queue = [start_url]
    seen = {}
    data = []
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen.get(url, 0)
        if depth == max_depth:
            print('Skipping %s due to depth' % url)
            continue
        html = download(url, user_agent=user_agent)
        if not html:
            continue
        if scrape_callback:
            # collect whatever the callback returns (None is treated as no data)
            data.extend(scrape_callback(url, html) or [])
        for link in get_links(html):
            if re.search(link_regex, link):
                abs_link = urljoin(start_url, link)
                print(abs_link)
                if abs_link not in seen:
                    seen[abs_link] = depth + 1
                    crawl_queue.append(abs_link)


if __name__ == "__main__":
    link_regex = '/(index|view)/'
    link_crawler('http://example.python-scraping.com', link_regex, scrape_callback=scrape_callback)
Finally, the output shows each /view/ page URL followed by the list of field values scraped from that page.
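Because only the callback needs to change per site, swapping in a different one is easy. For example, a callable class that appends each scraped row to a CSV file could be passed as scrape_callback instead; a minimal sketch, where the field subset and output file name are arbitrary choices:

import csv
import re
from lxml.html import fromstring


class CsvCallback:
    """ Append one CSV row per /view/ page visited by the crawler """
    def __init__(self):
        self.fields = ('area', 'population', 'iso', 'country_or_district', 'capital')
        self.writer = csv.writer(open('countries_or_districts.csv', 'w', newline=''))
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = fromstring(html)
            row = [tree.xpath('//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content()
                   for field in self.fields]
            self.writer.writerow(row)

It plugs into the crawler the same way:

link_crawler('http://example.python-scraping.com', '/(index|view)/', scrape_callback=CsvCallback())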