class ZufangDownloaderMiddleware(object):
    """Downloader middleware that renders selected pages in headless Firefox.

    Requests whose URL contains ``CLICK_PAGE_URL`` are loaded in a Selenium
    driven Firefox instance, the "next period" link is clicked, and the
    resulting page source is handed back to Scrapy as an ``HtmlResponse``.
    All other requests are left untouched (``process_request`` returns
    ``None``).

    Not all middleware methods need to be defined. If a method is not
    defined, Scrapy acts as if the downloader middleware does not modify
    the passed objects.
    """

    # URL fragment identifying the pages that must be rendered in a browser.
    CLICK_PAGE_URL = "http://www.XXX.com"
    # XPath of the link that is clicked to move to the next page.
    LOOK_MORE_XPATH = "//a[@id='goNextPeriod']"
    # How many times to try starting the browser.
    INIT_ATTEMPTS = 3
    # How many times to try loading and clicking once a browser is running.
    FETCH_ATTEMPTS = 3
    # How many extra clicks to issue while the browser URL has not changed.
    URL_CHANGE_ATTEMPTS = 3
    # Base wait, in seconds, used for the implicit wait and the fixed sleeps.
    WAIT_SECONDS = 5

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the ``spider_opened`` signal.

        Scrapy calls this to create the middleware instance.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def spider_opened(self, spider):
        """Handle the ``spider_opened`` signal connected in ``from_crawler``.

        The handler must exist: ``from_crawler`` references it, and without
        it building the middleware fails with ``AttributeError``.
        """
        spider.logger.info("Spider opened: %s", spider.name)

    def process_request(self, request, spider):
        """Render click-through pages in a browser; pass everything else on.

        Args:
            request: the Scrapy request being downloaded; only ``url`` is
                read here.
            spider: the spider issuing the request (unused).

        Returns:
            An ``HtmlResponse`` holding the page source after the click, with
            the browser's final URL stored in the ``new_url`` header, or
            ``None`` when the URL is not a click-through page or when the
            browser could not be started / the page could not be fetched.
        """
        print(request.url)
        if self.CLICK_PAGE_URL not in request.url:
            return None

        driver = self._start_driver()
        if driver is None:
            return None

        try:
            for _ in range(self.FETCH_ATTEMPTS):
                try:
                    return self._render(driver, request)
                except Exception:
                    # Best effort: report and retry with the same browser.
                    print("get news data failed ")
        finally:
            # quit() (not close()) ends the whole browser session, so no
            # Firefox/driver process is left behind on success or failure.
            driver.quit()
        return None

    def _start_driver(self):
        """Start a headless Firefox driver, retrying on failure; None if all fail."""
        for _ in range(self.INIT_ATTEMPTS):
            try:
                options = webdriver.FirefoxOptions()
                options.add_argument('--headless')
                options.add_argument('--disable-gpu')
                # NOTE(review): `firefox_options=` is the legacy keyword name;
                # confirm against the installed Selenium version.
                return webdriver.Firefox(firefox_options=options)
            except Exception:
                print("webdriver init failed ")
        return None

    def _render(self, driver, request):
        """Load the request URL, click through, and wrap the page as a response."""
        driver.get(request.url)
        driver.implicitly_wait(self.WAIT_SECONDS)
        time.sleep(self.WAIT_SECONDS)
        print(driver.title)

        # NOTE(review): `find_element_by_xpath` is the legacy Selenium call;
        # confirm against the installed Selenium version.
        driver.find_element_by_xpath(self.LOOK_MORE_XPATH).click()
        time.sleep(self.WAIT_SECONDS)
        print(driver.current_url)

        # The click sometimes does not navigate; click again with a longer
        # wait each time while the browser is still on the requested URL.
        for attempt in range(1, self.URL_CHANGE_ATTEMPTS + 1):
            if request.url not in driver.current_url:
                break
            driver.find_element_by_xpath(self.LOOK_MORE_XPATH).click()
            time.sleep(self.WAIT_SECONDS + attempt * 2)
            print(driver.current_url)

        true_page = driver.page_source
        # Expose the post-click URL to the spider via a response header.
        new_url = {'new_url': driver.current_url}
        return HtmlResponse(
            request.url,
            body=true_page,
            headers=new_url,
            encoding='utf-8',
            request=request,
        )
Scrapy 中使用 Selenium 模拟点击下一页的代码
最新推荐文章于 2024-08-05 17:23:58 发布