This article walks through simulated login in Scrapy.
How does Scrapy start its requests?
After running scrapy genspider <spider_name> <domain> (the command takes a spider name, not a .py filename), open the generated Python file:
import scrapy


class LoginSpider(scrapy.Spider):
    name = "login"
    allowed_domains = ["17k.com"]
    start_urls = ["https://17k.com"]

    def parse(self, response, **kwargs):
        pass

Think about it: given start_urls = ["https://17k.com"], how does Scrapy actually send the request?
You can see that LoginSpider inherits from scrapy.Spider. Looking inside scrapy.Spider, you will find a method named start_requests; this is where Scrapy sends its requests:
def start_requests(self):
    if not self.start_urls and hasattr(self, "start_url"):
        raise AttributeError(
            "Crawling could not start: 'start_urls' not found "
            "or empty (but found 'start_url' attribute instead, "
            "did you miss an 's'?)"
        )
    for url in self.start_urls:
        yield Request(url, dont_filter=True)
These default requests carry no cookies, so the start_requests() provided by scrapy.Spider does not meet our needs. (Also note dont_filter=True: start URLs skip the duplicate filter.) Ctrl-click into Request and inspect its constructor:
def __init__(
    self,
    url: str,
    callback: Optional[Callable] = None,
    method: str = "GET",
    headers: Optional[dict] = None,
    body: Optional[Union[bytes, str]] = None,
    cookies: Optional[Union[dict, List[dict]]] = None,
    meta: Optional[dict] = None,
    encoding: str = "utf-8",
    priority: int = 0,
    dont_filter: bool = False,
    errback: Optional[Callable] = None,
    flags: Optional[List[str]] = None,
    cb_kwargs: Optional[dict] = None,
) -> None:
Here is the definition of the cookies parameter: cookies: Optional[Union[dict, List[dict]]] = None.
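As the type hint suggests, cookies accepts either a dict or a list of dicts. A minimal sketch of both forms; the URL and cookie values are placeholders:

import scrapy

url = "https://17k.com"  # placeholder URL

# Form 1: a plain dict of cookie name -> value.
scrapy.Request(url, cookies={"sessionid": "abc123"})

# Form 2: a list of dicts, when each cookie also needs its own domain/path.
scrapy.Request(url, cookies=[
    {"name": "sessionid", "value": "abc123",
     "domain": "17k.com", "path": "/"},
])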
So when a method provided by the parent class does not satisfy the subclass's requirements, we override it. Here, that means overriding start_requests().
Setting cookies
There are several ways to set cookies.
Cookie management for a single request
Method 1: copy the cookie straight from the browser.
a. Pass it through the Request's cookies parameter. The cookie must be a dict.
import scrapy


class LoginSpider(scrapy.Spider):
    name = "login"
    allowed_domains = ["17k.com"]
    # This page requires a cookie to be accessed normally,
    # so we must override scrapy.Spider's start_requests().
    start_urls = ["https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919"]

    # Method 1: copy the cookie string from the browser.
    # Carrying the cookie can be handled in several ways; Scrapy stores
    # cookies as a dict (source: self.cookies = cookies or {}).
    def start_requests(self):
        cookie_str = 'your cookie string here'
        # Split the cookie string into a dict.
        cookie_dic = {}
        for item in cookie_str.split('; '):
            # Split on the first '=' only; cookie values may contain '='.
            k, v = item.split('=', 1)
            cookie_dic[k.strip()] = v.strip()
        # The default callback is parse().
        yield scrapy.Request(url=self.start_urls[0],
                             cookies=cookie_dic)

    def parse(self, response, **kwargs):
        print(response.text)
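As an aside, the standard library can do this parsing for us. A minimal sketch of an alternative to the hand-written split, using http.cookies.SimpleCookie:

from http.cookies import SimpleCookie

# SimpleCookie parses a raw Cookie header string such as "a=1; b=2".
cookie = SimpleCookie()
cookie.load('a=1; b=2')
cookie_dic = {name: morsel.value for name, morsel in cookie.items()}
print(cookie_dic)  # {'a': '1', 'b': '2'}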
b. Set the cookie in the request headers.
This requires setting COOKIES_ENABLED to False in settings.py; otherwise Scrapy's cookie middleware manages the Cookie header itself and can overwrite the one you set by hand:
COOKIES_ENABLED = False
def start_requests(self):
    headers = {
        "cookie": "your cookie string here",
    }
    url = 'the url that requires the cookie'
    yield scrapy.Request(url, callback=self.parse, headers=headers)
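If you would rather not flip the switch globally in settings.py, Scrapy's custom_settings class attribute can scope it to a single spider. A minimal sketch; the spider name and URL are placeholders:

import scrapy


class HeaderCookieSpider(scrapy.Spider):
    name = "header_cookie_demo"  # hypothetical
    # Disable the cookie middleware for this spider only, so the
    # hand-set Cookie header below is sent as-is.
    custom_settings = {"COOKIES_ENABLED": False}

    def start_requests(self):
        headers = {"cookie": "your cookie string here"}
        yield scrapy.Request("https://17k.com", headers=headers)

    def parse(self, response, **kwargs):
        print(response.text)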
Method 2: simulate the login flow.
Note that POST parameters in a scrapy Request work differently from the requests library: the payload is not a dict, and the parameter is named body, not data. Its value is the raw url-encoded string you see under "view source" of the form data in the browser's developer tools.
import scrapy


class LoginSpider(scrapy.Spider):
    name = "login"
    allowed_domains = ["17k.com"]
    # This page requires a cookie to be accessed normally,
    # so we override scrapy.Spider's start_requests().
    start_urls = ["https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919"]

    # Method 2: obtain the cookie by simulating the login flow.
    def start_requests(self):
        # One way to send the POST request, but building the body by hand is clumsy.
        user = '******'
        pwd = '******'
        url = 'https://passport.17k.com/ck/user/login'
        # Note: unlike requests, the POST payload is not a dict named data;
        # the parameter is body, a raw url-encoded string (what the browser
        # shows under "view source" of the form data).
        yield scrapy.Request(url=url,
                             method='post',
                             body='loginName={}&password={}'.format(user, pwd)
                             )

    # After logging in, request start_urls to fetch the page source.
    # The cookie middleware (enabled by default) stores the session cookie
    # from the login response and attaches it to this follow-up request.
    def parse(self, response, **kwargs):
        yield scrapy.Request(url=LoginSpider.start_urls[0], callback=self.parse_detail)

    def parse_detail(self, response, **kwargs):
        print(response.text)
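Building the body with string formatting breaks if the account or password contains characters such as & or =. A safer sketch builds the same body with urllib.parse.urlencode, which is also what FormRequest does internally (shown below); the values here are hypothetical:

from urllib.parse import urlencode

user = 'a&b'   # hypothetical values with special characters
pwd = 'p=1'
body = urlencode({'loginName': user, 'password': pwd})
print(body)  # loginName=a%26b&password=p%3D1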
Requesting with scrapy.FormRequest
data = {
    'loginName': '********',
    'password': '*******'
}
yield scrapy.FormRequest(url=url,
                         method='post',
                         formdata=data
                         )
import scrapy


class LoginSpider(scrapy.Spider):
    name = "login"
    allowed_domains = ["17k.com"]
    # This page requires a cookie to be accessed normally,
    # so we override scrapy.Spider's start_requests().
    start_urls = ["https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919"]

    # Method 2 (improved): log in with FormRequest, which url-encodes
    # the formdata dict into the request body for us.
    def start_requests(self):
        data = {
            'loginName': '*******',
            'password': '*******'
        }
        url = 'https://passport.17k.com/ck/user/login'
        yield scrapy.FormRequest(url=url,
                                 method='post',
                                 formdata=data
                                 )

    # After logging in, request start_urls to fetch the page source.
    def parse(self, response, **kwargs):
        yield scrapy.Request(url=LoginSpider.start_urls[0], callback=self.parse_detail)

    def parse_detail(self, response, **kwargs):
        print(response.text)
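For logins that go through an HTML <form> rather than a JSON endpoint, Scrapy also provides FormRequest.from_response, which pre-fills hidden inputs (CSRF tokens and the like) from the page before merging in your fields. A minimal sketch; the spider name, URL, and credentials are placeholders:

import scrapy


class HtmlFormLoginSpider(scrapy.Spider):
    name = "html_form_login"  # hypothetical
    start_urls = ["https://example.com/login"]  # hypothetical login page

    def parse(self, response, **kwargs):
        # from_response locates the <form> in the page, keeps its hidden
        # inputs, and merges in our formdata before POSTing.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"loginName": "******", "password": "******"},
            callback=self.after_login,
        )

    def after_login(self, response):
        print(response.text)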
The difference between FormRequest and Request
The official documentation reads as follows; from the docs alone, the difference is barely visible.
The FormRequest class adds a new argument to the constructor. The remaining arguments are the same as for the Request class and are not documented here.
Parameters: formdata (dict or iterable of tuples) – is a dictionary (or iterable of (key, value) tuples) containing HTML Form data which will be url-encoded and assigned to the body of the request.
In short, FormRequest adds a single new parameter, formdata, which accepts a dict (or an iterable of (key, value) tuples) of form data and url-encodes it into the request body. FormRequest inherits from Request:
class FormRequest(Request):
    def __init__(self, *args, **kwargs):
        formdata = kwargs.pop('formdata', None)
        if formdata and kwargs.get('method') is None:
            kwargs['method'] = 'POST'

        super(FormRequest, self).__init__(*args, **kwargs)

        if formdata:
            items = formdata.items() if isinstance(formdata, dict) else formdata
            querystr = _urlencode(items, self.encoding)
            if self.method == 'POST':
                self.headers.setdefault(b'Content-Type', b'application/x-www-form-urlencoded')
                self._set_body(querystr)
            else:
                self._set_url(self.url + ('&' if '?' in self.url else '?') + querystr)
And the _urlencode helper it calls:
def _urlencode(seq, enc):
    values = [(to_bytes(k, enc), to_bytes(v, enc))
              for k, vs in seq
              for v in (vs if is_listlike(vs) else [vs])]
    return urlencode(values, doseq=1)
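To see what this produces, a small sketch calling the standard library's urlencode directly; doseq=1 expands list values into repeated keys, which is why _urlencode flattens list-like values first:

from urllib.parse import urlencode

print(urlencode([('key', 'value'), ('k', 'v')], doseq=1))  # key=value&k=v
print(urlencode([('tag', ['a', 'b'])], doseq=1))           # tag=a&tag=b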
So the {'key': 'value', 'k': 'v'} we pass ends up as 'key=value&k=v', and when formdata is given the default method is POST. Now look at Request again:
class Request(object_ref):
    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None):
        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
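Note the last line shown, self.method = str(method).upper(): this is why passing method='post' in lowercase, as in the examples above, still sends an uppercase HTTP POST.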