1.禁用Cookie
部分网站会通过用户的Cookie信息对用户进行识别与分析,所以要防止目标网站识别我们的会话信息。
在Scrapy中,cookie默认是开启的(settings.py中的 #COOKIES_ENABLED = False 默认处于注释状态)。
去掉注释,设置为:COOKIES_ENABLED = False(即禁用cookie);对于确实需要cookie的请求,可以在请求头headers中手动加入cookie,例如:
class LagouspiderSpider(scrapy.Spider):
    """Spider that queries Lagou's ``positionAjax.json`` endpoint via form POSTs.

    The search keyword (``python``) and city (``深圳``) are fixed in the form
    data. The cookie is sent as an ordinary request header together with a
    Referer and a browser User-Agent.
    """

    name = "lagouspider"
    allowed_domains = ["www.lagou.com"]

    # Endpoint with an empty query string; the original note kept
    # "city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false" commented out after it.
    url = 'https://www.lagou.com/jobs/positionAjax.json?'

    # Pagination state: current page number and total page count (0 = unknown).
    page = 1
    allpage = 0

    # NOTE(review): this looks like a cookie captured from a browser session;
    # presumably it expires and has to be refreshed -- confirm before reuse.
    cookie = (
        'JSESSIONID=ABAAABAAAFCAAEG34858C57541C1F9DF75AED18C3065736; '
        'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1524281748; '
        '04797acf-4515-11e8-90b5- LGSID=20180421130026-e7e614d7-4520-'
        'PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3Fcity%3D%25E6%25B7%25B1%25E5%259C%25B3%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; '
        'PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2F4302345.html; '
        'LGRID=20180421130208-24b73966-4521-11e8-90f2-525400f775ce; '
        'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1524286956'
    )

    # Headers attached to every request issued by this spider.
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'cookie': cookie,
    }

    def start_requests(self):
        """Yield the form POST for the current page (``first`` set to 'true')."""
        first_page_form = {
            'first': 'true',
            'pn': str(self.page),
            'kd': 'python',
            'city': '深圳',
        }
        yield scrapy.FormRequest(
            self.url,
            headers=self.headers,
            formdata=first_page_form,
            callback=self.parse,
        )
2.设置下载延时
在Scrapy中,下载延时默认是关闭的(settings.py中的 #DOWNLOAD_DELAY = 3 默认处于注释状态)。
去掉#即可启用;也可以在spider的两次请求之间加入 time.sleep(random.randint(5, 10))(注意:time.sleep会阻塞Scrapy的事件循环,一般优先使用DOWNLOAD_DELAY),例如:
def parse(self, response):
    """Yield one item per job posting in the JSON response, then the next page.

    The response body is parsed as JSON and the postings are read from
    ``content -> positionResult -> result``. For every posting a new
    ``LagouItem`` is filled with the keys that match the item's declared
    fields. While more pages remain, a form POST for the following page is
    yielded with this method as its callback.

    Args:
        response: Response whose ``text`` is the JSON document.

    Yields:
        ``LagouItem`` objects, optionally followed by one ``scrapy.FormRequest``.
    """
    payload = json.loads(response.text)

    # A payload without content/positionResult (e.g. an error document) would
    # otherwise surface as a bare KeyError; log it and stop paginating.
    position_result = (payload.get('content') or {}).get('positionResult')
    if not position_result:
        self.logger.warning(
            'Response has no positionResult, stopping: %s', response.text[:200]
        )
        return

    total_count = int(position_result['totalCount'])  # postings overall
    page_size = int(position_result['resultSize'])  # postings on this page

    for posting in position_result['result']:
        # A fresh item per posting: reusing one instance would make every
        # yielded item alias the same object and carry over stale fields.
        item = LagouItem()
        for field in item.fields:
            if field in posting:
                item[field] = posting[field]
        yield item

    if not page_size:
        # An empty page means there is nothing further to request.
        return

    if not self.allpage:
        # Ceiling division, computed once from the first page so that a short
        # last page cannot inflate the page count.
        self.allpage = (total_count + page_size - 1) // page_size

    if self.page < self.allpage:
        # Random pause between page requests. time.sleep blocks the whole
        # process while waiting; the DOWNLOAD_DELAY setting is the
        # non-blocking alternative.
        time.sleep(random.randint(5, 10))
        self.page += 1
        yield scrapy.FormRequest(
            self.url,
            headers=self.headers,
            formdata={
                'first': 'false',
                'pn': str(self.page),
                'kd': 'python',
                'city': '深圳',
            },
            callback=self.parse,
        )
3.设置USER-AGENT和代理ip
settings中:
# Downloader middlewares to enable, mapped to their order values.
DOWNLOADER_MIDDLEWARES = {
    'doubanMongo.middlewares.RandomUserAgent': 300,
    'doubanMongo.middlewares.RandomProxy': 400,
}

# Pool of User-Agent strings to pick from.
USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
    'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
    'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
    'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
]

# Proxy pool: each entry has 'ip_port' ("host:port") and 'user_passwd'
# ("user:password"). The original note also listed commented-out sample
# entries of the same shape with an empty 'user_passwd'.
PROXIES = [
    {'ip_port': '117.48.214.249:16817', 'user_passwd': '632345244:4tf9pcpw'},
]
middlewares.py中(注意:scrapy.conf 在较新版本的Scrapy中已被移除,新版本应通过 crawler.settings 或 spider.settings 读取配置):
from scrapy.conf import settings
import base64
import random
class RandomProxy(object):
    """Downloader middleware that sends each request through a random proxy.

    Proxies are read from the ``PROXIES`` setting: a list of dicts with an
    ``ip_port`` key ("host:port") and a ``user_passwd`` key ("user:password",
    or empty/None for a proxy that needs no authentication).
    """

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']`` and, if needed, ``Proxy-Authorization``.

        Args:
            request: Request being processed; its ``meta`` and ``headers``
                are modified in place.
            spider: Spider issuing the request; ``spider.settings`` supplies
                the ``PROXIES`` list.

        Returns:
            None.
        """
        # Read settings from the spider instead of the scrapy.conf global.
        proxies = spider.settings.get("PROXIES")
        if not proxies:
            # No proxy configured: leave the request untouched.
            return

        proxy = random.choice(proxies)
        request.meta['proxy'] = 'http://' + proxy['ip_port']

        # Truthiness covers both None and the '' placeholder, so a proxy
        # without credentials never gets an empty "Basic " header.
        credentials = proxy.get('user_passwd')
        if credentials:
            # b64encode adds no trailing newline (base64.encodestring did, and
            # it was removed in Python 3.9).
            token = base64.b64encode(credentials.encode('utf-8')).decode('ascii')
            request.headers['Proxy-Authorization'] = 'Basic ' + token
class RandomUserAgent(object):
    """Downloader middleware that gives requests a random User-Agent.

    Candidates are read from the ``USER_AGENTS`` setting (a list of strings).
    """

    def process_request(self, request, spider):
        """Set a random ``User-Agent`` header unless the request already has one.

        Args:
            request: Request being processed; its ``headers`` are modified
                in place.
            spider: Spider issuing the request; ``spider.settings`` supplies
                the ``USER_AGENTS`` list.

        Returns:
            None.
        """
        # Read settings from the spider instead of the scrapy.conf global.
        user_agents = spider.settings.get("USER_AGENTS")
        if not user_agents:
            # Nothing to choose from: random.choice would raise on an empty list.
            return

        # setdefault keeps a User-Agent that was set explicitly on the request.
        request.headers.setdefault('User-Agent', random.choice(user_agents))