Scrapy: a summary of simple ways around anti-crawler measures
1. Set the User-Agent
@ Simple approach: define a ua_list in settings.py and pick the User-Agent from it. In settings.py:
ua_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'
]
In the spider file, from … import ua_list, then put the headers in the spider class:
headers = {'User-Agent': random.choice(ua_list)}
However, random.choice only runs once, when the class attribute is created; every later Request reuses that same value. The choice therefore has to happen inside the parsing function so each request's headers are re-randomized:
def parse(self, response):  # remember to send headers, otherwise the site redirects the request
    nodes = response.xpath('//a[@class="position_link"]/@href').extract()
    for node in nodes:
        # re-randomize the User-Agent for every request
        self.headers['User-Agent'] = random.choice(ua_list)
        yield scrapy.Request(url=node, callback=self.parse_job, headers=self.headers)  # meta can carry extra data if needed
        print(self.headers)
    # extract the next-page link and call parse again, recursing through the pages
    next_url = response.xpath('//a[@class="page_no"][contains(text(),"下一页")]/@href').extract_first("")
    if next_url:  # a next page exists
        yield scrapy.Request(url=next_url, headers=self.headers, callback=self.parse)
        time.sleep(2)  # note: time.sleep blocks Scrapy's reactor; DOWNLOAD_DELAY (section 3) is the better tool
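An equivalent variation (my sketch, not from the original) builds a fresh headers dict per request instead of mutating the shared class attribute, which avoids shared mutable state:
def parse(self, response):
    for node in response.xpath('//a[@class="position_link"]/@href').extract():
        headers = {'User-Agent': random.choice(ua_list)}  # fresh dict per request
        yield scrapy.Request(url=node, callback=self.parse_job, headers=headers)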
@ Set headers in a middleware. Middleware is the hook framework sitting between the spider, downloader, scheduler, and engine; it lets you process requests and responses as they pass through.
1. In settings.py, import UserAgent (this class comes from the fake_useragent package) and set it up:
from fake_useragent import UserAgent
ua = UserAgent()
UA = ua.random
2. In the middleware file, from lagou_spider.settings import UA, then:
class RandomUserAgentMiddleware(object):
    """This middleware allows spiders to override the user_agent"""
    # switch the user agent on outgoing requests

    def __init__(self):
        self.useragent = UA

    def process_request(self, request, spider):
        request.headers.setdefault('User-Agent', self.useragent)
3. Enable it in settings:
DOWNLOADER_MIDDLEWARES = {
    'lagou_spider.middlewares.RandomUserAgentMiddleware': 1,
}
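Beware that UA = ua.random is evaluated once at import time, so every request still carries the same string. A per-request sketch (my variation, assuming UserAgent is the fake_useragent class as above):
from fake_useragent import UserAgent

class RandomUserAgentMiddleware(object):
    """Pick a fresh random User-Agent for every outgoing request."""
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # ua.random returns a new random UA string on each access
        request.headers['User-Agent'] = self.ua.random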
2. Set a proxy
@ First, build a dedicated crawler to harvest proxies. Remember the proxy format Scrapy expects:
return 'http://{0}:{1}'.format(ip, port)
The proxy crawler:
import requests
from bs4 import BeautifulSoup
import pymysql
import time

class GetIP(object):
    url = r'http://www.xicidaili.com/wn/'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:54.0) Gecko/20100101 Firefox/54.0'}
    def get_ip(self, next_page='/wn/1'):
        # build the next-page url
        next_page_url = r'http://www.xicidaili.com' + str(next_page)
        self.process_page_info(next_page_url)
        response = requests.get(next_page_url, headers=self.header).text
        soup = BeautifulSoup(response, 'html.parser')
        next_page = soup.find('a', attrs={'class': 'next_page'})
        if next_page:  # a next page exists, recurse into it
            self.get_ip(next_page['href'])
    # download the ips on each page
    def process_page_info(self, url):
        response = requests.get(url, headers=self.header).text
        soup = BeautifulSoup(response, 'html.parser')
        conn, cur = self.connect_db()
        ip_info_list = soup.find_all('tr', attrs={'class': 'odd'})
        for ip_info in ip_info_list:
            ip = ip_info.find_all('td')[1].text
            port = ip_info.find_all('td')[2].text
            ty = ip_info.find_all('td')[5].text
            para = (ip, port, ty)
            print(para)
            # insert the ip into the database only if it passes a validity check
            if self.check_proxy(ip, port, ty):
                time.sleep(1)
                self.insert_db(cur, conn, para)
    def check_proxy(self, ip, port, ty):
        proxy = str(ip) + ':' + str(port)
        proxy_dict = {ty.lower(): proxy}  # requests wants lowercase scheme keys, e.g. {'http': ...}
        try:
            resp = requests.get(self.url, headers=self.header, proxies=proxy_dict, timeout=5)
        except requests.RequestException:  # dead proxies raise instead of returning a status code
            print('False')
            return False
        if resp.status_code == 200:
            print('True')
            return True
        print('False')
        return False
    def connect_db(self):
        conn = pymysql.connect(host='localhost', user='root', passwd='zhangxinwoaini', charset='utf8', db='lagou')
        cur = conn.cursor()
        return conn, cur

    def insert_db(self, cur, conn, para):
        insert_sql = '''
            insert into ip_db(ip, port, type) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE ip=VALUES(ip)
        '''
        cur.execute(insert_sql, para)
        conn.commit()
    def get_ip_from_db(self):
        conn, cur = self.connect_db()
        get_ip_sql = '''
            select ip, port, type from ip_db ORDER BY rand() LIMIT 1
        '''
        cur.execute(get_ip_sql)
        for ip_info in cur.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            ty = ip_info[2]
            # return the proxy in the format scrapy expects
            return 'http://{0}:{1}'.format(ip, port)
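A quick usage sketch of the class above: run it once to fill the table, then draw random proxies from it.
if __name__ == '__main__':
    getter = GetIP()
    getter.get_ip()                   # crawl the proxy site and store working ips
    print(getter.get_ip_from_db())    # e.g. 'http://121.61.x.x:8118'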
@ Add the proxy in a middleware:
class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        ip = GetIP().get_ip_from_db()
        request.meta['proxy'] = ip
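process_request fires for every request, so constructing a new GetIP each time is wasteful; a variant (my sketch, not from the original) that reuses one helper instance:
class RandomProxyMiddleware(object):
    def __init__(self):
        self.ip_getter = GetIP()  # one helper for the middleware's lifetime

    def process_request(self, request, spider):
        # note: get_ip_from_db still opens a db connection per call as written;
        # caching the connection inside GetIP would cut that cost too
        request.meta['proxy'] = self.ip_getter.get_ip_from_db()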
@ Enable both middlewares in settings:
DOWNLOADER_MIDDLEWARES = {
    'lagou_spider.middlewares.RandomUserAgentMiddleware': 1,
    'lagou_spider.middlewares.RandomProxyMiddleware': 2,
}
3. Throttle downloads by setting a delay in settings:
DOWNLOAD_DELAY = 1
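A fixed delay is easy to fingerprint. Scrapy can jitter and auto-tune the delay with built-in settings; a sketch for settings.py:
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True  # wait between 0.5x and 1.5x of DOWNLOAD_DELAY
AUTOTHROTTLE_ENABLED = True      # let Scrapy adapt the delay to server latency
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10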
4. For CAPTCHAs, use a captcha-solving platform, solve them by hand, or recognize them yourself.
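For the manual route, a minimal sketch (the solve_captcha_manually helper is hypothetical, not from the original; assumes Pillow is installed): download the captcha image, show it, and type in the answer.
import requests
from io import BytesIO
from PIL import Image

def solve_captcha_manually(captcha_url, headers):
    # fetch the captcha image and display it locally
    resp = requests.get(captcha_url, headers=headers)
    Image.open(BytesIO(resp.content)).show()
    # a human reads the image and types the answer
    return input('Enter the captcha text: ')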