I. Building Crawler Tools
1. Building a crawler with urllib
from urllib import request

url = 'https://movie.douban.com/cinema/later/wuhan/'
# Send a browser-like User-Agent so the request is not rejected as an obvious bot
header = {}
header['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

req = request.Request(url, headers=header)
resp = request.urlopen(req)
con = resp.read().decode('utf-8')
print(con)
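The snippet above will raise an exception if the site refuses the connection or the request times out. A slightly more defensive variant is sketched below (a minimal sketch; the function name fetch and the 10-second timeout are choices made here, not part of the original):

from urllib import request

def fetch(url, headers, timeout=10):
    # Return the decoded page body, or None if the request fails
    req = request.Request(url, headers=headers)
    try:
        with request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode('utf-8', errors='replace')
    except Exception as e:
        print('request failed:', e)
        return None

page = fetch(url, header)
if page:
    print(page[:500])  # only print the first 500 characters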
2. Crawling with requests
import requests

login_url = 'https://chengxin.mail.163.com/entrance.integrity/auth/login.do?email=%s&password=%s'
# base_headers was not shown in the original; a browser User-Agent is a reasonable default
base_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

def login(email, password):
    # Use a session so that cookies returned by the login call are kept for later requests
    session = requests.session()
    url = login_url % (email, password)
    try:
        r = session.post(url, headers=base_headers)
        if r.json()['code'] == 200 and r.json()['message'] == 'SUCCESS':
            return True
    except Exception as e:
        print('login error', str(e))
    return False
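If later requests must carry the login cookies, it is more useful to hand back the session object instead of a boolean. A minimal sketch of that variant, reusing login_url and base_headers from above (the follow-up URL in the comment is a hypothetical placeholder):

def login_session(email, password):
    # Return the logged-in session so its cookies can be reused, or None on failure
    session = requests.session()
    try:
        r = session.post(login_url % (email, password), headers=base_headers)
        if r.json().get('code') == 200 and r.json().get('message') == 'SUCCESS':
            return session
    except Exception as e:
        print('login error', str(e))
    return None

# sess = login_session('someone@163.com', 'secret')
# if sess:
#     r = sess.get('https://chengxin.mail.163.com/some/page.do')  # hypothetical follow-up request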
II. Anti-Crawling Measures and How to Get Around Them
1. What the robots protocol does
It tells search engines which pages may be crawled and which may not. It can also be used to block access to some of a site's larger files, such as images, music, and video, which saves server bandwidth.
For example, the contents of https://www.csdn.net/robots.txt:
User-agent: *
Disallow: /scripts
Disallow: /public
Disallow: /css/
Disallow: /images/
Disallow: /content/
Disallow: /ui/
Disallow: /js/
Disallow: /scripts/
Disallow: /article_preview.html*
Disallow: /tag/
Disallow: /*?*
Disallow: /link/
Sitemap: http://www.csdn.net/article/sitemap.txt
# How to parse a robots.txt file
from urllib import robotparser

url = 'https://www.csdn.net/robots.txt'
robot = robotparser.RobotFileParser()
robot.set_url(url)
robot.read()
print(robot.can_fetch("*", '/'))       # True: '/' is not disallowed
print(robot.can_fetch("*", '/link/'))  # False: '/link/' is explicitly disallowed
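A crawler can also honour the crawl delay a site declares in robots.txt, if any. The sketch below is one way to combine the two checks (a minimal sketch; RobotFileParser.crawl_delay needs Python 3.6+ and returns None when no Crawl-delay directive is set):

import time
from urllib import request, robotparser

def polite_fetch(page_url, robots_url, user_agent='*'):
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    if not rp.can_fetch(user_agent, page_url):
        print('robots.txt disallows', page_url)
        return None
    delay = rp.crawl_delay(user_agent)
    if delay:
        time.sleep(delay)  # wait as long as the site asks before requesting
    return request.urlopen(page_url).read()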
2. Ways to deal with anti-crawling measures
(1) Collect User-Agent strings and rotate them during crawling; the fake-useragent library can be used for this (a combined sketch of points (1)-(3) follows this list).
(2) Set the Referer request header.
(3) Build an IP proxy pool from a free proxy site, for example:
http://www.66ip.cn/mo.php?sxb=&tqsl=2000&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=
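A minimal sketch combining the three points: a random User-Agent from fake-useragent, a Referer header, and a proxy picked from a small pool (the proxy addresses and the Referer value here are placeholders, and fake-useragent must be installed separately):

import random
from urllib import request
from fake_useragent import FakeUserAgent

ua = FakeUserAgent()
proxy_pool = ['101.231.104.82:80', '218.27.204.240:8000']  # placeholder proxies

def fetch_with_evasion(url, referer='https://www.douban.com/'):
    proxy = random.choice(proxy_pool)                       # (3) pick a proxy from the pool
    opener = request.build_opener(request.ProxyHandler({'http': proxy}))
    headers = {
        'User-Agent': ua.random,                            # (1) rotate the User-Agent
        'Referer': referer,                                 # (2) set a plausible Referer
    }
    return opener.open(request.Request(url, headers=headers), timeout=5)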
III. A Complete Example
spider.py
# coding:utf-8
from xjmfc.spider_douban import spiderDouban
from threading import Thread
from queue import Queue

def startSpider(ipq):
    base_url = 'https://movie.douban.com/explore#!type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=recommend&page_limit=20&page_start=0'
    spider = spiderDouban(base_url, ipq=ipq)
    spider.start_download()

if __name__ == '__main__':
    # Seed the shared proxy queue with a few addresses (replace them with live proxies)
    ipqueue = Queue()
    ipqueue.put('101.231.104.82:80')
    ipqueue.put('218.27.204.240:8000')
    ipqueue.put('218.27.136.169:8085')
    t = Thread(target=startSpider, args=(ipqueue,))
    t.start()
    t.join()  # wait for the spider thread instead of busy-looping
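The three hard-coded proxies will go stale quickly. One way to refill the queue from the free proxy site listed in section II is sketched below (a minimal sketch; it assumes that endpoint returns a plain-text page with one ip:port per line and that GBK decoding is acceptable for it):

import re
from urllib import request

def fill_proxy_queue(ipq, api_url):
    # Download the proxy list and push every ip:port entry into the shared queue
    body = request.urlopen(api_url, timeout=10).read().decode('gbk', errors='replace')
    for proxy in re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', body):
        ipq.put(proxy)

# fill_proxy_queue(ipqueue, 'http://www.66ip.cn/mo.php?...')  # full URL shown in section II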
spider_douban.py
# coding:utf-8
import re
from urllib import request
from bs4 import BeautifulSoup
from fake_useragent import FakeUserAgent


class spiderDouban(object):
    def __init__(self, url=None, ipq=None):
        self.starturl = url        # first page to crawl
        self.ua = FakeUserAgent()  # random User-Agent generator
        self.ipqueue = ipq         # shared queue of proxy addresses
        self.ips = []              # local working set of proxies
        self.opener = None         # urllib opener bound to the current proxy
        self.reqnum = 0            # number of requests made so far
        self.iterips = None        # iterator over self.ips
        self.curip = None          # proxy currently in use
    def getipfrom_ips(self):
        # Move up to three proxies from the shared queue into the local pool
        if self.ipqueue:
            while len(self.ips) < 3:
                try:
                    ip = self.ipqueue.get(timeout=1)
                    self.ips.append(ip)
                except Exception:
                    print('no proxy')
                    break
    def start_download(self):
        url = self.starturl
        self.getipfrom_ips()
        while url:
            print('pageurl:', url)
            url = self.load_page(url)
        # Return unused proxies to the shared queue once crawling finishes
        for ip in self.ips:
            self.ipqueue.put(ip)
    def get_proxyip(self):
        # Cycle through the local proxy pool, refilling it from the queue when exhausted
        if self.iterips is None:
            self.iterips = iter(self.ips)
        try:
            return next(self.iterips)
        except StopIteration:
            self.getipfrom_ips()
            if self.ips:
                self.iterips = iter(self.ips)
                return next(self.iterips)
            return None

    def change_proxy_ip(self):
        # Point the opener at the next proxy; return False when no proxy is available
        ip = self.get_proxyip()
        if ip:
            try:
                proxyhd = request.ProxyHandler({"http": ip})
                self.opener = request.build_opener(proxyhd)
                self.curip = ip
            except Exception:
                return False
            return True
        return False
    def req_page(self, url):
        req = None
        # Switch to a fresh proxy on the first request and then every 10 requests
        if self.reqnum % 10 == 0:
            self.change_proxy_ip()
        while True:
            try:
                headinfo = {'User-Agent': self.ua.random}
                reqhd = request.Request(url, headers=headinfo)
                req = self.opener.open(reqhd, timeout=5)
                self.reqnum += 1
                break
            except Exception as e:
                print('catch e:', e)
                # Drop the failing proxy and move on to the next one
                if self.curip in self.ips:
                    self.ips.remove(self.curip)
                self.curip = None
                if not self.change_proxy_ip():
                    return None
        if req.code != 200:
            return None
        pageinfo = req.read().decode('utf-8')
        return pageinfo
    def parse(self, url, mname):
        # Fetch a movie detail page; real parsing of the details is omitted
        pinfo = self.req_page(url)
        if not pinfo:
            return
        print(pinfo)

    def find_nextpage(self, obj):
        # The next-page link sits inside <span class="next"><a href="...">
        nexturl = None
        if obj:
            nextpage = obj.find('span', class_="next")
            if nextpage:
                a = nextpage.find('a')
                if a:
                    nexturl = a.get('href')
        return nexturl

    def load_page(self, url):
        pinfo = self.req_page(url)
        if not pinfo:
            return
        print(pinfo)
        obj = BeautifulSoup(pinfo, 'html.parser')
        '''
        Parsing the page for movie entries is omitted here
        '''
        return self.find_nextpage(obj)