爬虫之 Selenium 爬取斗鱼网站
示例代码:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
class Douyu(object):
    """Crawl the Douyu "all rooms" directory with a Selenium-driven Chrome.

    The crawler opens the directory page, tries to dismiss a popup, then
    repeatedly scrapes the room entries of the current page, prints them and
    clicks the "next page" control until that control can no longer be found
    or clicked.

    All XPath expressions are positional, so they depend on the page layout
    at the time of writing and need updating if the site markup changes.
    """

    # Seconds to pause before reading a page so the room list can render.
    PAGE_LOAD_WAIT = 3
    # Seconds to pause after opening the site so the popup has time to appear
    # before we try to close it.
    POPUP_WAIT = 6

    def __init__(self):
        """Store the directory URL and start a Chrome WebDriver session."""
        self.url = 'https://www.douyu.com/directory/all'
        self.driver = webdriver.Chrome()

    def parse_data(self):
        """Scrape the room entries of the page currently loaded in the driver.

        Prints the number of room elements found.

        Returns:
            A list with one dict per room, holding the text of the room's
            ``h3`` element under ``'title'`` and of its ``span`` element
            under ``'type'``.
        """
        time.sleep(self.PAGE_LOAD_WAIT)
        room_list = self.driver.find_elements(
            By.XPATH,
            '//*[@id="listAll"]/section[2]/div[2]/ul/li/div/a',
        )
        print(len(room_list))
        return [
            {
                'title': room.find_element(By.XPATH, './div[2]/div[1]/h3').text,
                'type': room.find_element(By.XPATH, './div[2]/div[1]/span').text,
            }
            for room in room_list
        ]

    def save_data(self, data_list):
        """Print every scraped record in ``data_list``, one per line."""
        for data in data_list:
            print(data)

    def run(self):
        """Crawl page after page until the "next page" control is unavailable.

        The WebDriver session is always shut down when crawling ends, whether
        it finished normally or was interrupted by an error, so no browser or
        driver process is left behind.
        """
        try:
            self.driver.get(self.url)
            self._close_popup()
            while True:
                self.save_data(self.parse_data())
                if not self._go_to_next_page():
                    break
        finally:
            self.driver.quit()

    def _close_popup(self):
        """Best-effort dismissal of the popup (e.g. an ad) on the first page."""
        # The popup needs a few seconds to show up before it can be closed.
        time.sleep(self.POPUP_WAIT)
        try:
            self.driver.find_element(By.XPATH, '/html/body/div[2]/span[1]').click()
        except Exception as e:
            # No popup (or it could not be clicked): report and carry on.
            print(e)

    def _go_to_next_page(self):
        """Click the "next page" control; return False when that fails."""
        try:
            # An alternative locator is to match the control by its visible
            # "下一页" ("next page") label instead of by position.
            el_next = self.driver.find_element(
                By.XPATH,
                '//*[@id="listAll"]/section[2]/div[2]/div/ul/li[9]/span',
            )
            # Scroll far down so the pagination bar is in view before clicking.
            self.driver.execute_script('scrollTo(110,100000)')
            el_next.click()
        except Exception:
            # Any lookup/click failure ends pagination; unlike a bare
            # ``except`` this still lets Ctrl-C and SystemExit propagate.
            return False
        return True
if __name__ == '__main__':
    # Start the crawl only when this file is executed as a script.
    douyu = Douyu()
    douyu.run()
运行效果:
思路用图: