- 因为 `process_request` 会在每一个请求经过下载中间件时被调用(called for each request that goes through the downloader middleware),所以在 `process_request` 方法中集成 Selenium。
import datetime
import time

from scrapy import signals
from scrapy.http.response.html import HtmlResponse
from selenium import webdriver
class SeleniumSpiderDownloaderMiddleware(object):
    """Downloader middleware that fetches every request through headless Chrome.

    ``process_request`` loads the request URL in a Selenium-driven Chrome
    instance and returns the rendered page source as an ``HtmlResponse``.
    One browser instance is created per middleware instance and is shut
    down when the spider closes.
    """

    # Chrome only recognises the override when it is passed as the
    # ``--user-agent=`` command-line switch; a bare ``User-Agent=...``
    # argument is not a switch and does not change the user agent.
    USER_AGENT_ARGUMENT = (
        "--user-agent=Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) "
        "AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    )

    # Class-level default so instances created without ``__init__`` still work.
    wait_seconds = 1

    def __init__(self, wait_seconds=1):
        """Start a headless Chrome driver.

        Args:
            wait_seconds: Seconds to sleep after each page load before the
                page source is read. Defaults to 1, the original fixed delay.
        """
        self.wait_seconds = wait_seconds

        option = webdriver.ChromeOptions()
        option.add_argument(self.USER_AGENT_ARGUMENT)
        option.add_experimental_option("excludeSwitches", ["enable-automation"])
        # NOTE(review): value 2 for this content setting presumably blocks
        # image loading to speed up rendering -- confirm against Chrome docs.
        prefs = {"profile.managed_default_content_settings.images": 2}
        option.add_experimental_option("prefs", prefs)
        option.add_argument("--headless")
        self.driver = webdriver.Chrome(options=option)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and register browser cleanup on spider close."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_closed, signal=signals.spider_closed
        )
        return middleware

    def process_request(self, request, spider):
        """Load ``request.url`` in the browser and return the rendered page.

        Args:
            request: The request being processed; only ``request.url`` is read.
            spider: The spider that issued the request (unused).

        Returns:
            An ``HtmlResponse`` whose URL is the browser's current URL and
            whose body is the UTF-8 encoded page source.
        """
        self.driver.get(request.url)
        # Fixed pause after navigation before the page source is captured.
        time.sleep(self.wait_seconds)
        source = self.driver.page_source
        if not source:
            # Emit a timestamped notice when the browser returned no markup.
            print("INFO: %s %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),source))
        body = source.encode("utf-8")
        response = HtmlResponse(
            url=self.driver.current_url,
            body=body,
            request=request,
            encoding="utf-8",
        )
        return response

    def spider_closed(self, spider):
        """Quit the browser so no Chrome process outlives the crawl.

        Safe to call more than once: the driver reference is cleared before
        quitting, and a missing driver is ignored.
        """
        driver, self.driver = getattr(self, "driver", None), None
        if driver is not None:
            driver.quit()