import time
from pprint import pprint
from selenium import webdriver
class DouYuSpider:
    """Crawl the Douyu "all rooms" directory with a Selenium-driven Chrome.

    Each listing page is scraped into a list of dicts (one per room card)
    and handed to :meth:`save_content_list`; the crawl then clicks the
    "next page" pager element until that element is missing or disabled.
    """

    # Locator strategy name understood by ``find_element``/``find_elements``
    # (same value as selenium's ``By.XPATH``). The generic finders are used
    # because the ``find_element_by_*`` helpers no longer exist in recent
    # Selenium releases.
    _XPATH = "xpath"

    def __init__(self):
        """Start a maximized Chrome session and remember the start URL."""
        self.start_url = "https://www.douyu.com/directory/all"
        self.driver = webdriver.Chrome()
        self.driver.maximize_window()

    def __del__(self):
        """Quit the browser when the spider is garbage-collected."""
        # ``driver`` is absent when ``webdriver.Chrome()`` failed in __init__;
        # guard so the finalizer does not raise AttributeError in that case.
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()

    def get_content_list(self):
        """Scrape the room cards on the currently loaded page.

        Every scraped item is pretty-printed as it is collected.

        Returns:
            tuple: ``(content_list, next_url)`` where ``content_list`` is a
            list of dicts with the keys ``room_id`` (the room link's href),
            ``room_pic``, ``room_zone``, ``room_title``, ``room_user`` and
            ``room_hot``, and ``next_url`` is the clickable "next page"
            element, or ``False`` when there is no further page.

        Raises:
            Whatever the driver raises when a room card lacks one of the
            expected sub-elements (``NoSuchElementException`` in Selenium).
        """
        by = self._XPATH
        cards = self.driver.find_elements(by, "//ul[@class='layout-Cover-list']/li")
        content_list = []
        for li in cards:
            item = {
                "room_id": li.find_element(
                    by, ".//a[@class='DyListCover-wrap']"
                ).get_attribute("href"),
                "room_pic": li.find_element(
                    by, ".//div[@class='DyListCover-imgWrap']/div/img"
                ).get_attribute("src"),
                "room_zone": li.find_element(by, ".//span[@class='DyListCover-zone']").text,
                "room_title": li.find_element(by, ".//h3[@class='DyListCover-intro']").text,
                "room_user": li.find_element(by, ".//h2[@class='DyListCover-user']").text,
                "room_hot": li.find_element(by, ".//span[@class='DyListCover-hot']").text,
            }
            pprint(item)
            content_list.append(item)

        # Locate the "next page" pager element. The plural finder returns an
        # empty list instead of raising, so a page without a pager ends the
        # crawl cleanly and the items scraped above are still returned.
        pager = self.driver.find_elements(
            by, "//div[@class='ListFooter']/ul/li[@title='下一页']"
        )
        if pager and pager[0].get_attribute("aria-disabled") == "false":
            return content_list, pager[0]
        return content_list, False

    def save_content_list(self, content_list):
        """Persist one page of scraped items (placeholder: does nothing).

        Args:
            content_list: list of item dicts from :meth:`get_content_list`.
        """
        pass

    def run(self, page_wait=5):
        """Run the whole crawl: open the start page, then scrape page by page.

        Args:
            page_wait: seconds to sleep after loading the start page and
                after each "next page" click, giving the page time to render.
        """
        # Request the start page and give it time to load.
        self.driver.get(self.start_url)
        time.sleep(page_wait)

        # A "page zoom is abnormal" tip may cover the page; dismiss it when
        # present. This is best-effort, so any driver error is tolerated, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            self.driver.find_element(self._XPATH, "//span[@class='ZoomTip-tipHide']").click()
        except Exception:
            print("无网页缩放不正常弹窗 继续执行")
        else:
            print("出现网页缩放不正常弹窗 已关闭")

        while True:
            # Extract this page's data plus the next-page element, then save.
            content_list, next_url = self.get_content_list()
            self.save_content_list(content_list)
            if not next_url:
                break
            # Click through to the next page and wait for it to render.
            print("-" * 66)
            next_url.click()
            time.sleep(page_wait)
if __name__ == "__main__":
    # Script entry point: build the spider, then crawl from the start page.
    spider = DouYuSpider()
    spider.run()