python3下使用scrapy实现模拟用户登录与cookie存储 —— 基础篇(马蜂窝)
1. 背景
2. 环境
- 系统:win7
- python 3.6.1
- scrapy 1.4.0
3. 标准的模拟登录步骤
- 第一步:首先进入用户登录的页面,拿到一些登录所需的参数(比如说知乎网站,登录页面里的 _xsrf)。
- 第二步:将这些参数,和账户密码,一起post到服务器,登录。
- 第三步:检查用户登录是否成功。
- 第四步:如果用户登录失败,排查错误,重新启动登录程序。
- 第五步:如果用户登录成功,按照正常流程爬取网站页面。
import scrapy
import datetime
import re
class mafengwoSpider(scrapy.Spider):
    """Simulated-login spider for mafengwo.cn.

    Flow: open the login page (collect initial cookies) -> POST the
    credentials -> request a login-only page to verify the session ->
    crawl a normal page. Cookie persistence is handled by Scrapy's
    cookie middleware (COOKIES_ENABLED).
    """

    # Per-spider settings: cookies enabled and debug-logged so the
    # login session can be inspected in the console output.
    custom_settings = {
        'LOG_LEVEL': 'DEBUG',
        'ROBOTSTXT_OBEY': False,
        'DOWNLOAD_DELAY': 2,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'DOWNLOAD_TIMEOUT': 25,
    }
    name = 'mafengwo'
    allowed_domains = ['mafengwo.cn']
    host = "http://www.mafengwo.cn/"
    # NOTE(review): credentials hard-coded in source — move them to
    # settings or environment variables before publishing this file.
    username = "13725168940"
    password = "aaa00000000"
    # Shared request headers; the Referer matches the login portal.
    headerData = {
        "Referer": "https://passport.mafengwo.cn/",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }

    def start_requests(self):
        """Step 1: fetch the login page to pick up initial cookies/params."""
        print("start mafengwo clawer")
        mafengwoLoginPage = "https://passport.mafengwo.cn/"
        loginIndexReq = scrapy.Request(
            url=mafengwoLoginPage,
            headers=self.headerData,
            callback=self.parseLoginPage,
            # fix: errorHandle existed but was never attached to a request
            errback=self.errorHandle,
            dont_filter=True,
        )
        yield loginIndexReq

    def parseLoginPage(self, response):
        """Step 2: POST the account credentials to the login endpoint."""
        print(f"parseLoginPage: url = {response.url}")
        loginPostUrl = "https://passport.mafengwo.cn/login/"
        yield scrapy.FormRequest(
            url=loginPostUrl,
            headers=self.headerData,
            method="POST",
            formdata={
                "passport": self.username,
                "password": self.password,
            },
            callback=self.loginResParse,
            errback=self.errorHandle,
            dont_filter=True,
        )

    def loginResParse(self, response):
        """Step 3: probe a login-required page to check the session.

        dont_redirect keeps the 302-to-login redirect visible so a
        failed login can be detected instead of silently following it.
        """
        print(f"loginResParse: url = {response.url}")
        routeUrl = "http://www.mafengwo.cn/plan/route.php"
        yield scrapy.Request(
            url=routeUrl,
            headers=self.headerData,
            meta={
                'dont_redirect': True,
            },
            callback=self.isLoginStatusParse,
            errback=self.errorHandle,
            dont_filter=True,
        )

    def isLoginStatusParse(self, response):
        """Step 4: logged in — continue with a normal crawl request."""
        print(f"isLoginStatusParse: url = {response.url}")
        yield scrapy.Request(
            url="https://www.mafengwo.cn/travel-scenic-spot/mafengwo/10045.html",
            headers=self.headerData,
            errback=self.errorHandle,
        )

    def parse(self, response):
        """Default callback for requests without an explicit callback."""
        print(f"parse: url = {response.url}, meta = {response.meta}")

    def errorHandle(self, failure):
        """Errback for failed requests: log the failing response."""
        print(f"request error: {failure.value.response}")

    def closed(self, reason):
        """Called by Scrapy when the spider finishes; report the summary."""
        finishTime = datetime.datetime.now()
        subject = f"clawerName had finished, reason = {reason}, finishedTime = {finishTime}"
        # fix: the summary string was built but never emitted
        print(subject)
E:\Miniconda\python.exe E:/documentCode/scrapyMafengwo/start.py
2018-03-19 17:03:54 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapyMafengwo)