使用selenium爬取斗鱼直播
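以前一直知道 find_elements_by_xpath 速度很慢(需要加载整个文档),但是一直没有太深刻的印象。直到写这个爬虫时才发现:先定位父元素,再在循环里用 XPath 定位其中的子元素,会报错(找不到目标元素)。一个常见的原因是:以 `//` 开头的 XPath 即使在子元素上调用,也会从整个文档的根节点开始查找;要限定在当前元素内部查找,应写成以 `.//` 开头的相对路径,或者像下面的代码那样改用 CSS 选择器。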
import datetime
import json
import threading
import time
from queue import Queue

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
class DouyuTVSpider:
    """Scrape Douyu's "all live rooms" directory page by page with Selenium.

    Every page of room cards becomes a list of dicts. The lists are queued on
    ``write_queue`` and appended to ``output_path`` by writer threads, so the
    finished file is a single JSON document shaped like
    ``{"data": [page, page, ...]}``.
    """

    # File the scraped pages are written to (relative to the working directory).
    output_path = "douyuTV.json"

    def __init__(self):
        """Start a headless Chrome session and set up the writer queue."""
        self.start_url = "https://www.douyu.com/directory/all"
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # The options must be handed to the driver; building them without
        # passing them in leaves the browser running with a visible window.
        self.driver = webdriver.Chrome(options=chrome_options)
        self.write_queue = Queue()
        self.thread_list = []
        # Page counter: incremented before every page, scraping stops once it
        # exceeds 50. Starting at 1 this allows 49 pages.
        # NOTE(review): 50 pages may have been intended - confirm.
        self.count = 1
        # Several writer threads append to the same file; the lock keeps one
        # page's JSON text from being interleaved with another's.
        self.write_lock = threading.Lock()

    @staticmethod
    def _parse_room(li):
        """Return the displayed fields of one room card element as a dict."""

        def find(selector):
            # CSS selectors are evaluated relative to ``li``. An XPath that
            # starts with "//" would search the whole document instead.
            return li.find_element(By.CSS_SELECTOR, selector)

        item = {}
        item['room_img'] = find("img.DyImg-content.is-normal ").get_attribute('src')
        item['room_title'] = find("h3.DyListCover-intro").text
        item['room_category'] = find("span.DyListCover-zone").text
        item['anchor'] = find("h2.DyListCover-user").text
        item['hot'] = find("span.DyListCover-hot").text
        item['spider_time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        return item

    def get_content_list(self):
        """Scrape the current page and every following page into ``write_queue``.

        Loops (instead of recursing once per page) until ``self.count``
        exceeds 50. After each page the "next page" button is clicked.
        """
        while True:
            self.count += 1
            if self.count > 50:
                return
            # Give the freshly loaded page time to render its room cards.
            time.sleep(2)
            li_list = self.driver.find_elements(By.CLASS_NAME, 'layout-Cover-item')
            content = [self._parse_room(li) for li in li_list]
            self.write_queue.put(content)
            print(1)  # progress marker: one line per scraped page
            self.driver.find_element(By.CSS_SELECTOR, "li.dy-Pagination-next").click()

    def save_content_list(self):
        """Worker loop: append each queued page to the output file.

        Never returns; it is meant to run in a daemon thread. Every item taken
        from the queue is acknowledged with ``task_done()`` even when writing
        fails, so ``write_queue.join()`` cannot hang on a failed write.
        """
        while True:
            dict_data = self.write_queue.get()
            try:
                chunk = json.dumps(dict_data, ensure_ascii=False, indent=4) + ","
                with self.write_lock:
                    with open(self.output_path, "a", encoding="utf-8") as f:
                        f.write(chunk)
            except (OSError, TypeError, ValueError) as exc:
                print("failed to write a page: {}".format(exc))
            finally:
                self.write_queue.task_done()

    def _close_json_document(self):
        """Drop the trailing comma (if any) and terminate the JSON document."""
        with open(self.output_path, "rb+") as f:
            f.seek(0, 2)  # 2 == seek relative to the end of the file
            size = f.tell()
            if size:
                f.seek(size - 1)
                # Only strip a separator; with zero pages written the last
                # byte is the opening "[" and must stay.
                if f.read(1) == b",":
                    f.truncate(size - 1)
            f.seek(0, 2)
            f.write(b"]}")

    def _write_queued_pages(self):
        """Drain ``write_queue`` with five writer threads, then close the file."""
        writers = [threading.Thread(target=self.save_content_list, daemon=True)
                   for _ in range(5)]
        self.thread_list.extend(writers)
        for t in writers:
            t.start()
        self.write_queue.join()
        self._close_json_document()

    def run(self):
        """Open the directory page, scrape it and write the JSON file.

        The output file is started fresh on every run. Pages scraped before a
        failure are still written and the document is still closed, and the
        browser is always shut down; the original exception then propagates.
        """
        try:
            self.driver.get(self.start_url)
            time.sleep(2)
            # Dismiss the zoom hint that overlays the page.
            self.driver.find_element(By.CSS_SELECTOR, "span.ZoomTip-tipHide").click()
            # "w", not "a": appending a second '{"data":[' to an old file
            # would leave two concatenated documents, i.e. invalid JSON.
            with open(self.output_path, "w", encoding="utf-8") as f:
                f.write('{"data":[')
            try:
                self.get_content_list()
            finally:
                self._write_queued_pages()
        finally:
            self.driver.quit()
if __name__ == "__main__":
    # Script entry point: build the Selenium spider and run one full crawl.
    spider = DouyuTVSpider()
    spider.run()
使用requests的方式爬取斗鱼直播
除非万不得已,我真的不喜欢使用 selenium 的方式,因为它实在是太慢了,而且有些情况下比 requests 还要麻烦得多。就这个案例来说,requests 方式比 selenium 方式快了近百倍。
import datetime
import json
import requests
import datetime
from queue import Queue
import threading
class DouyuTVSpider:
    """Fetch Douyu's live-room directory through its JSON endpoint.

    Three queues form a pipeline: ``url_queue`` (page URLs) feeds
    ``parse_queue`` (raw response text), which feeds ``write_queue`` (lists
    of room dicts). Worker threads move items along, and the result is
    written to ``output_path`` as one JSON document shaped like
    ``{"data": [page, page, ...]}``.
    """

    # File the scraped pages are written to (relative to the working directory).
    output_path = "douyuTV-update.json"
    # Seconds to wait for the server; without a timeout a stalled request
    # would block its worker, and therefore run(), forever.
    request_timeout = 10

    def __init__(self):
        """Set up the endpoint, request headers and the pipeline queues."""
        self.base_url = "https://www.douyu.com/gapi/rkc/directory/0_0/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
        self.url_queue = Queue()
        self.parse_queue = Queue()
        self.write_queue = Queue()
        self.thread_list = []
        # Several writer threads append to the same file; the lock keeps one
        # page's JSON text from being interleaved with another's.
        self.write_lock = threading.Lock()

    def parse_url(self):
        """Worker loop: download each queued URL and queue the response text.

        Never returns; meant to run in a daemon thread. A failed download is
        reported and skipped. ``task_done()`` is always called so that
        ``url_queue.join()`` cannot hang on a failed request.
        """
        while True:
            url = self.url_queue.get()
            try:
                response = requests.get(url, headers=self.headers,
                                        timeout=self.request_timeout)
                self.parse_queue.put(response.content.decode())
            except (requests.RequestException, UnicodeDecodeError) as exc:
                print(f"failed to fetch {url}: {exc}")
            finally:
                self.url_queue.task_done()

    def get_content_list(self):
        """Worker loop: turn each raw response into a list of room dicts.

        Reads the room list at ``data.rl`` of the decoded JSON, keeps only the
        keys ``rs16``, ``nn``, ``ol``, ``c2name`` and ``rn`` of every room and
        adds a ``spider_time`` timestamp. Never returns; meant to run in a
        daemon thread. Responses that are not JSON or lack the expected
        structure are reported and skipped instead of killing the worker.
        """
        wanted_keys = ("rs16", "nn", "ol", "c2name", "rn")
        while True:
            str_data = self.parse_queue.get()
            try:
                live_list = json.loads(str_data)['data']['rl']
                need_list = []
                for room in live_list:
                    need_dict = {key: value for key, value in room.items()
                                 if key in wanted_keys}
                    need_dict["spider_time"] = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    need_list.append(need_dict)
                self.write_queue.put(need_list)
            except (ValueError, KeyError, TypeError, AttributeError) as exc:
                print(f"skipping malformed response: {exc!r}")
            finally:
                self.parse_queue.task_done()

    def save_content_list(self):
        """Worker loop: append each queued page to the output file.

        Never returns; meant to run in a daemon thread. ``task_done()`` is
        always called so that ``write_queue.join()`` cannot hang on a failed
        write.
        """
        while True:
            data = self.write_queue.get()
            try:
                chunk = json.dumps(data, ensure_ascii=False, indent=4) + ","
                with self.write_lock:
                    with open(self.output_path, 'a', encoding='utf-8') as f:
                        f.write(chunk)
            except (OSError, TypeError, ValueError) as exc:
                print(f"failed to write a page: {exc}")
            finally:
                self.write_queue.task_done()

    def _close_json_document(self):
        """Drop the trailing comma (if any) and terminate the JSON document."""
        with open(self.output_path, 'rb+') as f:
            f.seek(0, 2)  # 2 == seek relative to the end of the file
            size = f.tell()
            if size:
                f.seek(size - 1)
                # Only strip a separator; with zero pages written the last
                # byte is the opening "[" and must stay.
                if f.read(1) == b",":
                    f.truncate(size - 1)
            f.seek(0, 2)
            f.write(b"]}")

    def run(self):
        """Crawl pages 1-50 of the directory and write them as one JSON file.

        The output file is started fresh on every run. Two download, three
        parse and four writer threads are started as daemons; the method
        returns once all three queues have been fully processed.
        """
        # "w", not "a": appending a second '{"data":[' to an old file would
        # leave two concatenated documents, i.e. invalid JSON.
        with open(self.output_path, 'w', encoding='utf-8') as f:
            f.write('{"data":[')
        for x in range(1, 51):
            self.url_queue.put(self.base_url + f"{x}")

        stages = ((self.parse_url, 2), (self.get_content_list, 3),
                  (self.save_content_list, 4))
        workers = [threading.Thread(target=target, daemon=True)
                   for target, amount in stages for _ in range(amount)]
        self.thread_list.extend(workers)
        for t in workers:
            t.start()

        # Joining in pipeline order is sufficient: an item is put on the next
        # queue before task_done() is called on the previous one.
        for q in [self.url_queue, self.parse_queue, self.write_queue]:
            q.join()
        self._close_json_document()
if __name__ == "__main__":
    # Script entry point: build the requests-based spider and run one crawl.
    spider = DouyuTVSpider()
    spider.run()