1. Simulating a GitHub login
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy import FormRequest


class Login1Spider(scrapy.Spider):
    name = 'login1'
    allowed_domains = ['github.com']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://github.com/',
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    start_urls = ['https://github.com/758109577']

    def start_requests(self):
        urls = ['https://github.com/login']
        for url in urls:
            yield Request(url, meta={'cookiejar': 1}, callback=self.github_login)

    def github_login(self, response):
        # First grab authenticity_token. You can inspect the page with `scrapy shell "url"`
        # (see the shell sketch after this spider) and read the token out of the page source.
        authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        self.logger.info('authenticity_token=' + authenticity_token)
        # The POST url can be captured with Fiddler. With dont_click=True the form data
        # is submitted directly, without simulating a click on any element.
        return FormRequest.from_response(response,
                                         url='https://github.com/session',
                                         meta={'cookiejar': response.meta['cookiejar']},
                                         headers=self.headers,
                                         formdata={'utf8': '✓',
                                                   'authenticity_token': authenticity_token,
                                                   'login': 'aaaaaa@qq.com',
                                                   'password': 'xxxxxx'},
                                         callback=self.github_after,
                                         dont_click=True,
                                         )

    def github_after(self, response):
        # Look for the string 'Browse activity' on the post-login home page.
        texts = response.xpath("//a[@class='UnderlineNav-item selected']/text()").extract()
        # If the string is present, the login succeeded.
        if 'Browse activity' in texts:
            self.logger.info('Login succeeded; keyword found: Browse activity')
            for url in self.start_urls:
                yield Request(url=url, callback=self.show)

    def show(self, response):
        print("############################")
        names = response.xpath("//span[@class='p-nickname vcard-username d-block']/text()").extract()
        if 'aaaaaa' in names:
            print(names)
            print("############################")
        else:
            print("Login failed")
2. Simulating a 51CTO login
items configuration
vim items.py
class CtoItem(scrapy.Item):
    title_url = scrapy.Field()
    title = scrapy.Field()
    fullname = scrapy.Field()
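One more piece of configuration worth noting: both spiders pass meta={'cookiejar': ...}, which only works while Scrapy's cookie middleware is enabled, and login pages are often disallowed by robots.txt. A minimal settings.py sketch; both are standard Scrapy settings, and the values here are assumptions about this project's setup:

COOKIES_ENABLED = True   # on by default; required for the 'cookiejar' meta key to keep the session
ROBOTSTXT_OBEY = False   # login endpoints are often disallowed by robots.txt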
vim login2.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import FormRequest, Request
from ..items import CtoItem


class Login2Spider(scrapy.Spider):
    name = 'login2'
    allowed_domains = ['51cto.com']
    # start_urls = ['http://51cto.com/']

    def start_requests(self):
        urls = ['http://home.51cto.com/index']
        for url in urls:
            yield Request(url, callback=self.cto_login, meta={'cookiejar': 1})

    def cto_login(self, response):
        csrf = response.xpath("//input[@name='_csrf']/@value").extract_first()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'http://www.51cto.com/',
            'Content-Type': 'text/html; charset=UTF-8'
        }
        self.logger.info("csrf value is %s" % csrf)
        yield FormRequest.from_response(response,
                                        url='https://blog.51cto.com/haoyonghui?type=1',
                                        headers=headers,
                                        meta={'cookiejar': response.meta['cookiejar']},
                                        formdata={
                                            'LoginForm[username]': 'aaaaaaa@qq.com',
                                            'LoginForm[password]': 'xxxxxx',
                                            'LoginForm[rememberMe]': '0',
                                            '_csrf': csrf,
                                        },
                                        callback=self.after_login,
                                        dont_click=True,
                                        )

    def after_login(self, response):
        resps = response.css('ul.artical-list li')
        for resp in resps:
            # Fill in the item fields (a fresh item per list entry).
            item = CtoItem()
            item['title_url'] = resp.css("a.tit::attr(href)").extract_first()
            item['title'] = resp.css("a.tit::text").extract_first().strip()
            # fullname is formatted as "[title](link)" because that is Markdown
            # link syntax: clicking the title opens the link.
            item['fullname'] = '[' + item['title'] + ']' + '(' + item['title_url'] + ')'
            # The logger call here is also just for debugging.
            print("###################")
            self.logger.info("title_url is %s, title is %s" % (item['title_url'], item['title']))
            yield item
        # Follow the next page, if there is one.
        next_page = response.css('li.next a::attr(href)').extract_first()
        # self.logger.info("next page link: %s" % next_page)
        if next_page is not None:
            yield Request(next_page, callback=self.after_login)
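Since fullname already holds a ready-made Markdown link, the yielded items can be collected by a small item pipeline that appends each link to a .md file. A minimal sketch; the pipeline class name, the output filename, and the 'myproject' module path are hypothetical and would need to match your project when registered in settings.py:

vim pipelines.py
class MarkdownLinkPipeline(object):  # hypothetical name, for illustration only
    def open_spider(self, spider):
        # Open the output file once when the crawl starts.
        self.f = open('blog_links.md', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        # Append one "[title](url)" line per scraped post.
        self.f.write(item['fullname'] + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()

Then enable it in settings.py:
ITEM_PIPELINES = {'myproject.pipelines.MarkdownLinkPipeline': 300}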
Reposted from: https://blog.51cto.com/haoyonghui/2140888