Without further ado, on to the code:
import json
import time

from selenium import webdriver


class douYuSpider():
    def __init__(self):
        self.start_url = 'https://www.douyu.com/directory/all'
        self.driver = webdriver.PhantomJS()

    def get_content_list(self):
        # every <li> under the live-list container is one live room card
        li_list = self.driver.find_elements_by_xpath('//ul[@id="live-list-contentbox"]/li')
        content_list = []
        for li in li_list:
            content = {}
            content['title'] = li.find_element_by_xpath('./a[@class="play-list-link"]').get_attribute('title')
            # the cover image is lazy-loaded, so the real URL sits in data-original
            content['img'] = li.find_element_by_xpath('./a[@class="play-list-link"]/span/img').get_attribute('data-original')
            content['game_name'] = li.find_element_by_xpath('./a[@class="play-list-link"]/div/div/span').text
            content['anchor_name'] = li.find_element_by_xpath('./a/div/p/span[@class="dy-name ellipsis fl"]').text
            content['watch_num'] = li.find_element_by_xpath('./a/div/p/span[@class="dy-num fr"]').text
            print(content)
            content_list.append(content)
        # find_elements returns an empty list on the last page, so this doubles as an existence check
        next_url = self.driver.find_elements_by_xpath('//a[@class="shark-pager-next"]')
        next_url = next_url[0] if len(next_url) > 0 else None
        print('------当前页完成------')
        return content_list, next_url

    def save_content_list(self, content_list):
        with open('斗鱼.txt', 'a', encoding='utf-8') as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write('\n')
        print('保存完成')

    def run(self):
        self.driver.get(self.start_url)
        content_list, next_url = self.get_content_list()
        self.save_content_list(content_list)
        while next_url:
            next_url.click()
            time.sleep(5)  # give the next page time to render before re-querying elements
            content_list, next_url = self.get_content_list()
            self.save_content_list(content_list)
        self.driver.quit()


if __name__ == '__main__':
    douyu = douYuSpider()
    douyu.run()
One small problem came up along the way: while crawling the pages with selenium, the following error was raised: selenium.common.exceptions.StaleElementReferenceException: Message: {"errorMessage":"Element does not exist in cache"}. In other words, the element can no longer be found in the cache, most likely because the page changed after the element had been located. This means that once the current page navigates to the next one, the element references cached for the old page are cleared as well. Increasing the time.sleep wait to 5 seconds made the error go away.
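A fixed sleep works, but it either wastes time or still races the page load. An alternative (not part of the original script) is an explicit wait: after clicking the "next" button, block until the reference from the old page goes stale before re-querying anything. A minimal sketch, reusing the driver and next_url element from the code above; go_to_next_page is a hypothetical helper name:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def go_to_next_page(driver, next_button, timeout=10):
    """Click 'next' and block until the old page's element reference goes stale."""
    next_button.click()
    # staleness_of returns True once next_button is detached from the DOM,
    # i.e. the old page has been replaced and it is safe to re-query elements.
    WebDriverWait(driver, timeout).until(EC.staleness_of(next_button))

With that helper, the next_url.click() plus time.sleep(5) pair in run() could be replaced by go_to_next_page(self.driver, next_url), which waits only as long as the page actually needs.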