1. 使用 cookies 登录
在爬虫文件中重构 start_requests 方法,模拟登录 GitHub:
先在浏览器中登录自己的账号,获取 cookies;
然后在 start_requests 方法中构造携带 cookies 的请求对象,交给引擎发送,响应会传给 parse 方法处理。
import scrapy
class GithubSpider(scrapy.Spider):
    """Log in to GitHub by attaching browser session cookies to the
    initial request, then check the profile page for a login indicator.
    """
    name = 'github'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/ahang1598']

    def start_requests(self):
        """Override the default start_requests so every start URL is
        fetched with the session cookies copied from a logged-in browser.
        """
        # Raw "Cookie" header value copied from the browser's dev tools.
        data = '_octo=GH1.1.339554947.1578661733; _ga=GA1.2.57501551.1578661765; _device_id=850ec0241c158fe00087e1c726139c; ...'
        # Convert the header string into the dict Scrapy expects.
        # Split on '=' at most once: cookie VALUES may legally contain '='
        # (e.g. base64 padding), which split('=')[0]/[-1] would corrupt.
        # A pair without '=' still maps to itself, as in the original.
        cookies = {x.split('=', 1)[0]: x.split('=', 1)[-1] for x in data.split('; ')}
        # Attach the cookies to each request (they ride on the request,
        # not as an argument to parse); iterate start_urls rather than
        # hard-coding index 0 so extra URLs are also covered.
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                cookies=cookies
            )

    def parse(self, response):
        """Print the text of a profile-page button that only renders for
        a logged-in user, as a crude login check.
        """
        # .extract() returns the matched strings (consistent with the
        # FormRequest spider's check) instead of a raw SelectorList.
        tmp = response.xpath('//*[@id="js-pjax-container"]/div[2]/div/div[1]/div/div[4]/div[2]/div[1]/button/text()').extract()
        print(tmp)
2. 发送 POST 请求登录 GitHub
- 主要是通过 scrapy.FormRequest() 提交表单数据,例如:
yield scrapy.FormRequest(
url='https://github.com/session',
callback=self.login,
formdata=post_data
)
- 爬虫文件代码
import scrapy
class GitSpider(scrapy.Spider):
    """Log in to GitHub by POSTing the sign-in form, then visit the
    profile page to verify the session is authenticated.
    """
    name = 'git'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        """Read the per-request anti-CSRF fields out of the login page
        (authenticity_token, timestamp, timestamp_secret) and submit
        the completed sign-in form to the session endpoint.
        """
        def first_value(query):
            # Helper: extract the first matching attribute value.
            return response.xpath(query).extract_first()

        auth_token = first_value('//input[@name ="authenticity_token"]/@value')
        ts = first_value('//input[@name = "timestamp"]/@value')
        ts_secret = first_value('//input[@name = "timestamp_secret"]/@value')

        # Form fields in the order the login page submits them; the
        # static entries mirror the browser's hidden inputs.
        form = {
            'commit': 'Sign in',
            'authenticity_token': auth_token,
            'ga_id': '1057972186.15942244',
            'login': '15xxx',
            'password': 'xxxx',
            'webauthn-support': 'supported',
            'webauthn-iuvpaa-support': 'unsupported',
            'return_to': '',
            'required_field_2db8': '',
            'timestamp': ts,
            'timestamp_secret': ts_secret
        }

        # POST the form to the session endpoint; on response, continue
        # in login() with the (hopefully) authenticated session.
        yield scrapy.FormRequest(
            url='https://github.com/session',
            callback=self.login,
            formdata=form
        )

    def login(self, response):
        """After the form POST, request the profile page and hand the
        response to the verification callback.
        """
        yield scrapy.Request(
            url = 'https://github.com/ahang1598',
            callback=self.login_chack
        )

    def login_chack(self, response):
        """Print the text of a button that only appears for the
        logged-in owner of the profile, confirming login succeeded.
        """
        button_text = response.xpath('//*[@id="js-pjax-container"]/div[2]/div/div[1]/div/div[4]/div[2]/div[1]/button/text()').extract()
        print(button_text)