scrapy 使用 selenium
以下是middlewares.py代码
import time

from scrapy import signals
from scrapy.http.response.html import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
class SeleniumDownloadMiddleware(object):
    """Scrapy downloader middleware that fetches pages with headless Chrome.

    Each request is loaded in a single shared Chrome instance, every
    "show more" button on the page is clicked until none is left, and the
    resulting DOM is handed back to Scrapy as an ``HtmlResponse``.  Because
    ``process_request`` returns a response, Scrapy's own downloader is
    bypassed for these requests.
    """

    # CSS class of the button that expands collapsed content.
    SHOW_MORE_CLASS = 'show-more'
    # Seconds to wait after the initial page load before touching the DOM.
    PAGE_LOAD_DELAY = 1
    # Seconds to wait after each click so the newly revealed content renders.
    CLICK_DELAY = 0.3

    def __init__(self):
        """Start the headless Chrome instance used for all requests."""
        chrome_options = webdriver.ChromeOptions()
        # Run Chrome without a visible window.
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-dev-shm-usage')
        # Chrome refuses to start as the root user unless the sandbox is
        # disabled, so pass --no-sandbox.
        chrome_options.add_argument('--no-sandbox')
        # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
        self.driver = webdriver.Chrome(options=chrome_options)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and arrange for the browser to be closed.

        Scrapy calls this instead of the constructor when it is defined.

        Args:
            crawler: The Scrapy crawler that owns this middleware.

        Returns:
            A new middleware instance whose ``spider_closed`` method is
            connected to the crawler's ``spider_closed`` signal.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_closed, signal=signals.spider_closed
        )
        return middleware

    def spider_closed(self, spider):
        """Quit the browser so no Chrome process outlives the crawl."""
        self.driver.quit()

    def process_request(self, request, spider):
        """Render ``request.url`` in Chrome and return the expanded page.

        Args:
            request: The Scrapy request to fetch.
            spider: The spider that issued the request (unused).

        Returns:
            An ``HtmlResponse`` built from the browser's final URL and page
            source, encoded as UTF-8.
        """
        self.driver.get(request.url)
        time.sleep(self.PAGE_LOAD_DELAY)
        self._expand_show_more()
        source = self.driver.page_source
        return HtmlResponse(
            url=self.driver.current_url,
            body=source,
            request=request,
            encoding='utf-8',
        )

    def _expand_show_more(self):
        """Click the "show more" button repeatedly until it can't be clicked.

        This is best-effort: the loop ends as soon as the button is missing
        or the click fails for any WebDriver reason, and whatever content
        has been revealed so far is kept.
        """
        while True:
            try:
                # 'class name' is the value of selenium's By.CLASS_NAME; the
                # generic find_element() works on Selenium 3 and 4, whereas
                # find_element_by_class_name() was removed in Selenium 4.
                button = self.driver.find_element(
                    'class name', self.SHOW_MORE_CLASS
                )
                button.click()
            except WebDriverException:
                # Covers the normal exit (button no longer present) as well
                # as stale or non-interactable elements.
                return
            time.sleep(self.CLICK_DELAY)