"""爬取实时的虎牙弹幕"""
"""不出现重复弹幕"""
"""不遗漏弹幕"""
from selenium import webdriver
import time
# Start a Chrome session and open the Huya live room whose chat messages
# (danmaku) are being scraped; `web` is the shared driver handle used by the
# polling loops sketched below.
room_url = 'https://www.huya.com/52333'
web = webdriver.Chrome()
web.get(room_url)
"""第一版"""
# while True:
# bullets_chat = web.find_elements_by_xpath('//*[@id="chat-room__list"]/div/div/span[3]')
# for bullet in bullets_chat:
# if bullet.text:
# print(bullet.text)
# time.sleep(3)
# 缺点:会出现重复弹幕,为减少重复弹幕增加sleep时间会遗漏弹幕
"""第二版"""
# bbb = []
# while True:
# bullets_chat = web.find_elements_by_class_name('msg')
# for bullet in bullets_chat:
# if bullet.text and bullet.text not in bbb:
# print(bullet.text)
# bbb.append(bullet.text)
# time.sleep(0.5)
# 缺点 1.bbb列表会越来越大,拖慢内存 2.如果两个人发了两个相同弹幕比如哈哈哈,只会出现1个
"""第三版"""
# bbb = []
# while True:
# bullets_chat = web.find_elements_by_class_name('msg')
# for bullet in bullets_chat:
# if bullet.text and bullet.text not in bbb:
# print(bullet.text)
# bbb.append(bullet.text)
# if len(bbb)>20:
# del_num = len(bbb)-20
# del bbb[0:del_num-1]
# time.sleep(0.5)
# 缺点:如果两个人发了两个相同弹幕比如哈哈哈,只会出现1个
# Final approach: also extract each message's data-cmid attribute, and use a
# dict storing data-cmid and the danmaku text to de-duplicate.