方法一:暴力登录(直接携带 Cookie):
直接将登录后的 cookie 提取出来,带着 cookie 和 headers 请求数据。注意一点:浏览器不要退出登录,否则 cookie 会发生变动(失效)。
# Spider class-body fragment: `start_requests` reads these two dicts through
# `self`, so all three definitions are meant to sit inside the spider class.

# Cookies copied from a browser session that is already logged in; they are
# sent along with the request below.
cookie = {'ps': 'y', 'bid': 'VPb0WSOJ764', 'dbcl2': '"163088717:nZorm3cicLo"'}

headers = {
    # Keep the connection open. The value must be the exact token
    # "keep-alive"; the original "keep - alive" (with spaces) is not a valid
    # Connection option.
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.6 Safari/537.36',
}


def start_requests(self):
    """Yield the initial request for ``start_urls[0]`` with the saved cookies.

    The request carries ``self.headers`` and ``self.cookie`` so it is sent
    with the cookies copied from the logged-in browser session.
    """
    yield Request(
        url=self.start_urls[0],
        headers=self.headers,
        cookies=self.cookie,  # send the request with the saved cookies
    )
方法二:带着 POST 参数请求登录:
#coding:utf-8 import scrapy from scrapy.http import Request,FormRequest import urllib2,urllib import webbrowser class LoginSpider(scrapy.Spider): name = 'login' allowed_domains = ['douban.com'] #start_urls = ['http://douban.com/'] header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"} def start_requests(self): return [FormRequest("https://accounts.douban.com/login",headers=self.header,meta={"cookiejar":1},callback=self.parse)] def parse(self, response): captcha = response.xpath("//img[@id='captcha_image']/@src").extract_first() id=response.xpath('//input[@name="captcha-id"]/@value').extract_first() print id if captcha: print(u"此时有验证码.") localpath = "E:\\0930\\captcha.png" urllib.urlretrieve(captcha,filename=localpath) webbrowser.open(localpath) captchar_value=raw_input(u"查看验证码是:") return [FormRequest.from_response(response, meta={'cookiejar': response.meta['cookiejar']}, headers=self.header, formdata={ 'source':'None', 'form_email': '183****4915', 'form_password': '****', 'captcha-solution': captchar_value, 'captcha-id': id, 'user_login': u'登录', 'redir':'https://www.douban.com/' }, callback=self.after_login, dont_filter=True)] else: return [FormRequest.from_response(response,meta={'cookiejar': response.meta['cookiejar']},headers=self.header, formdata={ 'source': 'None', 'form_email': '183****4915', 'form_password': '*****', 'user_login': u'登录', 'redir':'https://www.douban.com/people/163088717/' }, callback=self.after_login, dont_filter=True)] def after_login(self, response): print response.xpath('//title/text()').extract_first()