说明:
本文参考了 Scrapy 官网文档,以及 Stack Overflow 上的几个相关问题。
注意:
下面这个爬虫不能实际运行!我只是用它来展示登录,以及之后如何处理。
方式一:FormRequest
import scrapy
from myprojct.items import ExampleItem
class ExampleSpider(scrapy.Spider):
    """Log in with an explicit ``FormRequest`` and then crawl ``start_urls``.

    ``start_requests`` is overridden so that the very first request is the
    login form submission. The listed start URLs are requested only after
    ``login_check`` has seen a login response without the failure marker.
    """

    name = 'example'
    allowed_domains = ["example.com"]
    start_urls = [
        'http://www.example.com/articals/',
        'http://www.example.com/blogs/',
        'http://www.example.com/news/',
    ]

    def start_requests(self):
        """Return the login request instead of the default start requests.

        Returns:
            A one-element list holding the ``FormRequest`` that submits the
            credentials; its response is handled by ``login_check``.
        """
        # Log in first.
        return [scrapy.FormRequest("http://www.example.com/login",
                                   formdata={'user': 'john', 'pass': 'secret'},
                                   callback=self.login_check)]

    def login_check(self, response):
        """Check the login response and schedule the real crawl on success.

        Args:
            response: The response to the login form submission.

        Yields:
            One ``scrapy.Request`` per entry in ``start_urls``, each handled
            by ``parse_page``. Nothing is yielded when the login failed.
        """
        # ``response.body`` is bytes, so the marker must be a bytes literal;
        # testing a str against bytes raises TypeError on Python 3.
        if b"Login failed" in response.body:
            self.logger.error("Login failed")
            return
        # Login succeeded: request every start URL.
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_page)

    def parse_page(self, response):
        """Parse a listing page into items, one per table row.

        Args:
            response: The response for one of the start URLs.

        Yields:
            An ``ExampleItem`` per row matched by ``table.basictable tr``,
            with the first four ``td`` cells extracted into its fields.
        """
        for tablerow in response.css('table.basictable tr'):
            item = ExampleItem()
            item["name"] = tablerow.xpath('td[1]').extract()
            item["handicap"] = tablerow.xpath('td[2]').extract()
            item["exact"] = tablerow.xpath('td[3]').extract()
            item["category"] = tablerow.xpath('td[4]').extract()
            yield item
方式二:FormRequest.from_response
import scrapy
class LoginSpider(scrapy.Spider):
    """Log in by filling the form found on the login page itself.

    The login page is fetched as the only start URL; ``parse`` builds the
    form submission from that response via ``FormRequest.from_response``,
    and ``after_login`` inspects the result.
    """

    name = 'example.com'
    start_urls = ['http://www.example.com/users/login.php']

    def parse(self, response):
        """Build the login request from the form in the fetched page.

        Args:
            response: The response for the login page.

        Returns:
            A ``FormRequest`` created from ``response`` with the credential
            fields supplied; its response is handled by ``after_login``.
        """
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'john', 'password': 'secret'},
            callback=self.after_login
        )

    def after_login(self, response):
        """Verify that the login succeeded before continuing the crawl.

        Args:
            response: The response to the login form submission.

        Returns:
            ``None``. An error is logged when the failure marker is present.
        """
        # Check that login succeeded before going on. ``response.body`` is
        # bytes, so the marker must be a bytes literal; testing a str against
        # bytes raises TypeError on Python 3.
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")
            return
        # continue scraping with authenticated session...