Request source code
Note: the cookies argument accepts a dict (a list of dicts also works when you need to set domain/path per cookie).
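For example, a minimal sketch of the dict form (the URL and cookie values are placeholders); the full source follows below:

import scrapy

# dict form: cookie name -> value
req = scrapy.Request('http://example.com/profile',
                     cookies={'sessionid': 'abc123'})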
"""
This module implements the Request class which is used to represent HTTP
requests in Scrapy.
See documentation in docs/topics/request-response.rst
"""
import six
from w3lib.url import safe_url_string
from scrapy.http.headers import Headers
from scrapy.utils.python import to_bytes
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import escape_ajax
from scrapy.http.common import obsolete_setter
class Request(object_ref):

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        assert isinstance(priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority

        if callback is not None and not callable(callback):
            raise TypeError('callback must be a callable, got %s' % type(callback).__name__)
        if errback is not None and not callable(errback):
            raise TypeError('errback must be a callable, got %s' % type(errback).__name__)
        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None
        self.flags = [] if flags is None else list(flags)

    @property
    def meta(self):
        if self._meta is None:
            self._meta = {}
        return self._meta

    def _get_url(self):
        return self._url

    def _set_url(self, url):
        if not isinstance(url, six.string_types):
            raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)

        s = safe_url_string(url, self.encoding)
        self._url = escape_ajax(s)

        if ':' not in self._url:
            raise ValueError('Missing scheme in request url: %s' % self._url)

    url = property(_get_url, obsolete_setter(_set_url, 'url'))

    def _get_body(self):
        return self._body

    def _set_body(self, body):
        if body is None:
            self._body = b''
        else:
            self._body = to_bytes(body, self.encoding)

    body = property(_get_body, obsolete_setter(_set_body, 'body'))

    @property
    def encoding(self):
        return self._encoding

    def __str__(self):
        return "<%s %s>" % (self.method, self.url)

    __repr__ = __str__

    def copy(self):
        """Return a copy of this Request"""
        return self.replace()

    def replace(self, *args, **kwargs):
        """Create a new Request with the same attributes except for those
        given new values.
        """
        for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta',
                  'encoding', 'priority', 'dont_filter', 'callback', 'errback']:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)
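A quick usage sketch of copy() and replace() (the URL is a placeholder and self.parse is assumed to be a spider callback):

# replace() clones the request while overriding selected attributes
req = Request('http://example.com/page', callback=self.parse, priority=1)
retry = req.replace(dont_filter=True)  # same request, but skips the dupefilter
req2 = req.copy()                      # shorthand for req.replace()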
Response source code
- A Response object represents an HTTP response; it is usually produced by the downloader and fed to a spider for processing.
- url (string) - the URL of this response
- status (integer) - the HTTP status of the response
- headers (dict) - the headers of this response
- body (bytes) - the response body (the constructor rejects anything that is not bytes)
- flags (list) - the initial value of the Response.flags attribute
- request (Request object) - the initial value of the Response.request attribute; the request that generated this response
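During a crawl the downloader builds responses for you, but one can be constructed by hand, e.g. in tests. A sketch with placeholder values (the full source follows below):

# Sketch: building a Response manually; all values are placeholders
resp = Response(
    url='http://example.com/',
    status=200,
    headers={'Content-Type': 'text/html'},
    body=b'<html></html>',  # must be bytes, not str
)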
"""
This module implements the Response class which is used to represent HTTP
responses in Scrapy.
See documentation in docs/topics/request-response.rst
"""
from six.moves.urllib.parse import urljoin
from scrapy.http.request import Request
from scrapy.http.headers import Headers
from scrapy.link import Link
from scrapy.utils.trackref import object_ref
from scrapy.http.common import obsolete_setter
from scrapy.exceptions import NotSupported
class Response(object_ref):

    def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None):
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)
        self._set_url(url)
        self.request = request
        self.flags = [] if flags is None else list(flags)

    @property
    def meta(self):
        try:
            return self.request.meta
        except AttributeError:
            raise AttributeError(
                "Response.meta not available, this response "
                "is not tied to any request"
            )

    def _get_url(self):
        return self._url

    def _set_url(self, url):
        if isinstance(url, str):
            self._url = url
        else:
            raise TypeError('%s url must be str, got %s:' % (type(self).__name__,
                            type(url).__name__))

    url = property(_get_url, obsolete_setter(_set_url, 'url'))

    def _get_body(self):
        return self._body

    def _set_body(self, body):
        if body is None:
            self._body = b''
        elif not isinstance(body, bytes):
            raise TypeError(
                "Response body must be bytes. "
                "If you want to pass unicode body use TextResponse "
                "or HtmlResponse.")
        else:
            self._body = body

    body = property(_get_body, obsolete_setter(_set_body, 'body'))

    def __str__(self):
        return "<%d %s>" % (self.status, self.url)

    __repr__ = __str__

    def copy(self):
        """Return a copy of this Response"""
        return self.replace()

    def replace(self, *args, **kwargs):
        """Create a new Response with the same attributes except for those
        given new values.
        """
        for x in ['url', 'status', 'headers', 'body', 'request', 'flags']:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)

    def urljoin(self, url):
        """Join this Response's url with a possible relative url to form an
        absolute interpretation of the latter."""
        return urljoin(self.url, url)

    @property
    def text(self):
        """For subclasses of TextResponse, this will return the body
        as text (unicode object in Python 2 and str in Python 3)
        """
        raise AttributeError("Response content isn't text")

    def css(self, *a, **kw):
        """Shortcut method implemented only by responses whose content
        is text (subclasses of TextResponse).
        """
        raise NotSupported("Response content isn't text")

    def xpath(self, *a, **kw):
        """Shortcut method implemented only by responses whose content
        is text (subclasses of TextResponse).
        """
        raise NotSupported("Response content isn't text")

    def follow(self, url, callback=None, method='GET', headers=None, body=None,
               cookies=None, meta=None, encoding='utf-8', priority=0,
               dont_filter=False, errback=None):
        # type: (...) -> Request
        """
        Return a :class:`~.Request` instance to follow a link ``url``.
        It accepts the same arguments as ``Request.__init__`` method,
        but ``url`` can be a relative URL or a ``scrapy.link.Link`` object,
        not only an absolute URL.

        :class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
        method which supports selectors in addition to absolute/relative URLs
        and Link objects.
        """
        if isinstance(url, Link):
            url = url.url
        url = self.urljoin(url)
        return Request(url, callback,
                       method=method,
                       headers=headers,
                       body=body,
                       cookies=cookies,
                       meta=meta,
                       encoding=encoding,
                       priority=priority,
                       dont_filter=dont_filter,
                       errback=errback)
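A short sketch of follow() from inside a spider callback ('/next-page' is a placeholder relative URL):

def parse(self, response):
    # the relative URL is resolved against response.url via self.urljoin()
    yield response.follow('/next-page', callback=self.parse)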
- Proxy middleware httpproxy.py source code
import base64
from six.moves.urllib.request import getproxies, proxy_bypass
from six.moves.urllib.parse import unquote
try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from six.moves.urllib.parse import urlunparse
from scrapy.utils.httpobj import urlparse_cached
from scrapy.exceptions import NotConfigured
from scrapy.utils.python import to_bytes
class HttpProxyMiddleware(object):

    def __init__(self, auth_encoding='latin-1'):
        self.auth_encoding = auth_encoding
        self.proxies = {}
        for type, url in getproxies().items():
            self.proxies[type] = self._get_proxy(url, type)

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
        return cls(auth_encoding)

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding=self.auth_encoding)
        return base64.b64encode(user_pass).strip()

    def _get_proxy(self, url, orig_type):
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        return creds, proxy_url

    def process_request(self, request, spider):
        # ignore if proxy is already set
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            # extract credentials if present
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
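As the first branch of process_request shows, a proxy set in request.meta takes precedence over environment proxies. A sketch from inside a spider (address and credentials are placeholders; the middleware extracts the credentials into the Proxy-Authorization header):

yield scrapy.Request(
    url,
    meta={'proxy': 'http://user:password@10.0.0.1:8080'},
    callback=self.parse,
)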
- UserAgent middleware source code
"""Set User-Agent header per spider or use a default value from settings"""
from scrapy import signals
class UserAgentMiddleware(object):
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy'):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent)
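Because spider_opened copies a spider-level attribute into the middleware, a per-spider user_agent wins over the USER_AGENT setting. A sketch (name and UA string are placeholders):

import scrapy

class MySpider(scrapy.Spider):
    name = 'my_spider'
    user_agent = 'Mozilla/5.0 (compatible; MyBot/1.0)'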
- Writing custom middleware
Following the usage pattern of the source code above:
write the custom middleware in the project's middlewares.py file,
and activate it in settings.py (see the settings sketch after the code below).
from fake_useragent import UserAgent

class UserAgentMiddleware(object):

    def process_request(self, request, spider):
        # pick a random User-Agent for every outgoing request
        request.headers.setdefault(b'User-Agent', UserAgent().random)

class Proxy_Middleware(object):

    def process_request(self, request, spider):
        # request.meta['proxy'] = 'http://111.202.37.195:47818'
        request.meta['proxy'] = 'http://user:passwd@111.202.37.195:47818'
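A sketch of the activation in settings.py ('myproject' is a placeholder module path; the numbers control ordering among downloader middlewares):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.UserAgentMiddleware': 543,
    'myproject.middlewares.Proxy_Middleware': 544,
    # disable the built-in UserAgentMiddleware (priority 400) so it does not
    # set the User-Agent header before our setdefault() runs
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}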
Cookie login
The simplest approach: copy the raw cookie string from the browser, parse it into a dict, and pass it to the request.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

class DlSpider(scrapy.Spider):
    name = 'dl'
    # allowed_domains = ['daili.com']
    start_urls = ['http://httpbin.org/get']

    def start_requests(self):
        url = 'http://examp/login.html'  # placeholder login URL
        cookie_str = '__DAYU_PP=YIiviVjQaVb27EYRYQm2286123883f98; _zap=17f2ed49-cab3-446f-a24f-562707e8e87d; d_c0="AJBi4gZopQ2PThbvLKvPfUPUsTN5OOkXhsE=|1527218531"; _xsrf=Y43u3ukBMTDD3VN6CGxnbcg9SXREVsmM; tst=r; _ga=GA1.2.1326493332.1528083136; l_n_c=1; q_c1=7e24a9a55efc4e118f554134bbc47221|1540804442000|1522282079000; r_cap_id="NzUxNWNjNTIwNWRkNDcyOTk5YmMxM2FiNTU2MjRjMjA=|1540804441|5756b8e730a9e5e6eca23eb36ca3dce45a8d0577"; cap_id="YjU5NWI4MjVmZDQ1NDE3YThmMDNjOWU1N2QwMThiZWY=|154080z_c0="2|1:0|10:1540949886|4:z_c0|92:Mi4xTFhBSkJnQUFBQUFBa0dMaUJtaWxEU1lBQUFCZ0FsVk5mbFhHWEFBQ3RULVdhMUd4czZLbjVqcEFNcHNfRmVKaDN3|081f463f8ef38b3bfc7274f64b9e65db8055677fe53266c43c3b2018d7edf13a"'
        # parse the raw "key=value; key=value" cookie string into a dict
        cookies = {}
        for cookie in cookie_str.split(';'):
            key, value = cookie.split('=', 1)
            cookies[key.strip()] = value.strip()
        yield scrapy.Request(
            url=url,
            cookies=cookies,
            callback=self.parse
        )

    def parse(self, response):
        print(response.text)
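The same parsing can be written as a dict comprehension (identical logic to the loop above):

# equivalent one-liner for turning the cookie string into a dict
cookies = {k.strip(): v.strip()
           for k, v in (c.split('=', 1) for c in cookie_str.split(';'))}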