The crawler is written in Python 3 on Windows.
chapter 1 Web crawling
1. Background research
This covers the following:
(1) Check robots.txt
Most sites define a robots.txt file that states which parts of the site crawlers may not visit. The easiest way to read it is to open URL/robots.txt in a browser; a programmatic check is sketched after this list.
(2) Check the sitemap
robots.txt often points to a Sitemap URL. In practice many sites do not provide one, so this step is of limited use.
(3) Estimate the size of the site
A common way to estimate a site's size is to query a search engine for the target site; the reported number of results gives a rough count of its pages.
(4) Identify the technologies the site uses
The builtwith package reports the technologies a site is built with, which tells us how to approach parsing its pages:
import builtwith
builtwith.parse('https://www.cnblogs.com/ironstark/p/5303924.html')
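As a complement to item (1), robots.txt can also be checked programmatically with the standard library's urllib.robotparser. A minimal sketch, using the book's demo site and a placeholder user agent 'wswp':

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()
# True if this user agent is allowed to fetch the given URL
print(rp.can_fetch('wswp', 'http://example.webscraping.com/view/1'))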
Enough preamble; let's start writing the crawler.
2. A basic download function
Straight to the code:
import urllib.request as rq
import re
import itertools
import urllib.parse as urlparse
import urllib.robotparser
def download(url, user_agent='wswp', proxy=None, num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}  # set the user agent string
    request = rq.Request(url, headers=headers)
    opener = rq.build_opener()
    if proxy:  # optionally download through the given proxy
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(rq.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except rq.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # retry 5xx server errors (e.g. 504), at most num_retries times (2 by default)
            if hasattr(e, 'code') and 500 <= e.code < 600:
                html = download(url, user_agent, proxy, num_retries - 1)
    return html
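A quick usage example; the URL is the demo site used later in this post, but any reachable page works:

html = download('http://example.webscraping.com')
if html:
    print('got %d bytes' % len(html))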
3. Crawling strategies
(1) ID iteration
ID iteration suits sites whose page IDs are numeric, close together, and follow a predictable pattern.
max_error = 5   # maximum number of consecutive download errors allowed
num_error = 0   # current number of consecutive download errors
n = 0           # number of successful downloads (stop after 5 in this example)
for page in itertools.count(1):
    url = 'http://example.webscraping.com/view/-%d' % page
    html = download(url)
    if html is None:
        num_error += 1
        if num_error == max_error:
            break
    else:
        num_error = 0
        n += 1
        if n >= 5:
            break
print('download success')
(2) Link crawling
Advantage: a link crawler follows links much the way a real user browsing the site would, so it can reach pages whose URLs are not consecutive.
Disadvantage: it also downloads pages we do not need.
def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # avoid downloading the same URL twice
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

def get_links(html):  # extract the URLs linked from a page
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']')
    return webpage_regex.findall(str(html))  # download() returns bytes (or None), so convert first
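For example, to crawl the index and view pages of the demo site (the same regex is used in the integrated code in section 5):

link_crawler('http://example.webscraping.com', '/(index|view)')

Note that this basic version has no politeness delay, depth limit, or robots.txt check yet; those are added below.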
4. Advanced crawler features
(1) Parse robots.txt, using urllib.robotparser, so the crawler never downloads URLs it is not allowed to fetch. This can be done inside link_crawler: build a RobotFileParser for the seed URL's /robots.txt and call rp.can_fetch(user_agent, url) before downloading each URL; the integrated code in section 5 does exactly that.
(2) Limit the crawl depth so the crawler does not get stuck in crawler traps (pages that keep generating new links indefinitely).
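A minimal sketch of the idea, reusing download() and get_links() from above and recording each URL's depth in the seen dict; the function name link_crawler_depth and the default max_depth are made up for illustration, and the integrated code in section 5 folds the same logic into link_crawler:

def link_crawler_depth(seed_url, link_regex, max_depth=2):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}  # URL -> depth at which it was discovered
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        depth = seen[url]
        if depth == max_depth:
            continue  # deep enough: do not follow links from this page
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:
                    seen[link] = depth + 1
                    crawl_queue.append(link)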
5. Putting it all together
# -*- coding: utf-8 -*-
"""
Created on Sat May 25 20:09:01 2019
@author: Administrator
"""
import re
import urllib.request as rq
import urllib.parse as urlparse
import time
from datetime import datetime
import urllib.robotparser as robotparser
from collections import deque
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = deque([seed_url])
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []

            depth = seen[url]
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)
class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
def download(url, headers={}, proxy=None, num_retries=2, data=None):
    print('Downloading:', url)
    request = rq.Request(url, data, headers)
    opener = rq.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(rq.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except rq.URLError as e:
        print('Download error:', e.reason)
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html
def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)

def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc

def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp

def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage (html may be bytes, hence str())
    return webpage_regex.findall(str(html))
if __name__ == '__main__':
    # the demo site's robots.txt is meant to block the 'BadCrawler' user agent, so nothing is downloaded
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    # 'wswp' is allowed; crawl matching index/view pages, but only one level deep
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='wswp')
chapter 2 Scraping the data
We can now download pages; how do we extract the data from them? There are three main approaches: regular expressions, Beautiful Soup, and lxml.
Method | Performance | Ease of use | Pros | Cons |
---|---|---|---|---|
Regular expressions | Fast | Hard | Fast, work anywhere | Hard to write, fragile |
Beautiful Soup | Slow | Easy | Easy to use | Slow |
lxml | Fast | Fairly easy | Relatively easy to use and fast | – |
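As a quick illustration of the three approaches, here is a sketch that extracts the same field in each of the three ways. The HTML fragment and the places_area/w2p_fw names are made up for the example, and Beautiful Soup (pip install beautifulsoup4) and lxml (pip install lxml) must be installed:

import re
from bs4 import BeautifulSoup
import lxml.html

html = '<html><body><div id="places_area" class="w2p_fw">244,820 square kilometres</div></body></html>'

# 1) regular expression
area_re = re.search('<div[^>]*class="w2p_fw"[^>]*>(.*?)</div>', html).group(1)

# 2) Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')
area_bs = soup.find('div', attrs={'class': 'w2p_fw'}).text

# 3) lxml, using an XPath query
tree = lxml.html.fromstring(html)
area_lxml = tree.xpath('//div[@class="w2p_fw"]')[0].text_content()

print(area_re, area_bs, area_lxml)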
一、