场景:
获取DY直播间弹幕,并分类保存
安装第三方包:
经过调研,此处主要使用的是webdriver相关的包,也就是selenium,以及protobuf解析相关的包,如下:
# 安装Google的protobuf
pip install -i http://pypi.douban.com/simple --trusted-host pypi.douban.com protobuf
# 安装webdriver相关的selenium
pip install -i http://pypi.douban.com/simple --trusted-host pypi.douban.com selenium
在本地下载和自己chrome浏览器版本号对应的chromedriver的可执行文件(.exe),各个版本的webdriver下载链接如下:http://chromedriver.storage.googleapis.com/index.html
代码实现:
下面是主类的实现过程。注意:代码中chromedriver可执行文件的路径,需要修改成你本地下载的chromedriver文件在自己电脑上的实际存放位置,如下图:
#!/usr/bin/env python
from selenium import webdriver
import time
import json
from queue import Queue
import base64
import datetime
from selenium.webdriver.chrome.options import Options
from proto import message_pb2
class GiftTks():
    """Collect gift messages from a live room through headless Chrome.

    Chrome is started with performance logging enabled and the live-room page
    is loaded. Responses of the ``/webcast/im/fetch/`` endpoint that show up
    in the performance log are read back over the DevTools protocol, decoded
    as ``message_pb2.Response`` and scanned for ``WebcastGiftMessage``
    entries, whose ``common.describe`` text is printed and queued.
    """

    # Default chromedriver location; override through ``executable_path``.
    _DEFAULT_DRIVER_PATH = "D:\\tools\\python3.8\\chromedriver.exe"
    # Only responses whose URL starts with this prefix are inspected.
    _FETCH_URL_PREFIX = r'https://live.douyin.com/webcast/im/fetch/?aid='
    # ``method`` value that marks a gift message inside a fetch response.
    _GIFT_METHOD = 'WebcastGiftMessage'
    # Timestamp layout prepended to every reported gift line.
    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(self):
        super(GiftTks, self).__init__()

    @classmethod
    def run(cls, url, gift_queue, executable_path=_DEFAULT_DRIVER_PATH,
            poll_interval=1):
        """Open ``url`` and report gift messages until interrupted.

        Args:
            url: Address of the live-room page to load.
            gift_queue: Object with a ``put`` method (e.g. ``queue.Queue``);
                receives one ``"<timestamp> <description>"`` string per gift
                message. The same string is also printed.
            executable_path: Path of the chromedriver binary. Defaults to the
                location that used to be hard-coded here.
            poll_interval: Seconds to sleep between two reads of the
                performance log.

        This method loops forever and only leaves through an exception
        (including ``KeyboardInterrupt``); the browser is shut down on the
        way out so no headless Chrome process is left behind.
        """
        driver = webdriver.Chrome(
            executable_path=executable_path,
            desired_capabilities=cls._build_capabilities(),
            options=cls._build_options())
        try:
            # Hide ``navigator.webdriver`` from scripts of every new document.
            driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
            })
            driver.get(url)
            while True:
                for entry in driver.get_log('performance'):
                    request_id = cls._fetch_request_id(entry)
                    if request_id is None:
                        continue
                    payload = cls._response_body(driver, request_id)
                    if payload is None:
                        continue
                    for describe in cls._gift_descriptions(payload):
                        # Build the line once so the printed and the queued
                        # copy always carry the same timestamp.
                        line = '{} {}'.format(
                            datetime.datetime.now().strftime(cls._TIME_FORMAT),
                            describe)
                        print(line)
                        gift_queue.put(line)
                time.sleep(poll_interval)
        finally:
            driver.quit()

    @staticmethod
    def _build_options():
        """Return the Chrome options used for the headless session."""
        chrome_options = Options()
        # Run Chrome without a visible window.
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument("--start-maximized")
        # Incognito mode.
        chrome_options.add_argument('--incognito')
        # Disable the cache.
        chrome_options.add_argument("disable-cache")
        # Ignore certificate errors.
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_experimental_option(
            "excludeSwitches", ['enable-automation', 'enable-logging'])
        chrome_options.add_experimental_option('w3c', False)
        return chrome_options

    @staticmethod
    def _build_capabilities():
        """Return capabilities that request browser/driver/performance logs."""
        return {
            'browserName': 'chrome',
            'loggingPrefs': {
                'browser': 'ALL',
                'driver': 'ALL',
                'performance': 'ALL',
            },
            'goog:chromeOptions': {
                'perfLoggingPrefs': {
                    'enableNetwork': True,
                },
                'w3c': False,
            },
        }

    @classmethod
    def _fetch_request_id(cls, entry):
        """Return the request id if ``entry`` is a fetch response, else None.

        ``entry`` is one item of ``driver.get_log('performance')``; its
        ``message`` field is a JSON document wrapping the DevTools event.
        """
        log = json.loads(entry['message'])['message']
        if log.get('method') != 'Network.responseReceived':
            return None
        if 'params' not in log:
            return None
        params = log['params']
        if not str(params['response']['url']).startswith(cls._FETCH_URL_PREFIX):
            return None
        return params['requestId']

    @staticmethod
    def _response_body(driver, request_id):
        """Return the decoded body of ``request_id`` or None if unavailable.

        Failures of the DevTools call are reported and skipped so that one
        unreadable response does not stop the monitoring loop. Bodies that
        are not flagged as base64-encoded are ignored.
        """
        try:
            content = driver.execute_cdp_cmd(
                'Network.getResponseBody', {'requestId': request_id})
        except Exception as exc:
            print('{} {!r}'.format(request_id, exc))
            return None
        if content.get('base64Encoded') is not True:
            return None
        return base64.b64decode(content['body'])

    @classmethod
    def _gift_descriptions(cls, payload):
        """Yield ``common.describe`` of every gift message in ``payload``."""
        response = message_pb2.Response()
        response.ParseFromString(payload)
        for msg in response.messages:
            if msg.method != cls._GIFT_METHOD:
                continue
            gift = message_pb2.GiftMessage()
            gift.ParseFromString(msg.payload)
            yield gift.common.describe
if __name__ == '__main__':
    # Live-room URL; replace the placeholder with the real room link.
    url = "https:xxxxx"
    # Receives one formatted line per gift message reported by GiftTks.run.
    gift_queue = Queue()
    GiftTks.run(url, gift_queue)
总结:
从对Google Chrome的headless模式一知半解,到开始使用headless模式进行技术调研并解决问题,我对浏览器技术的快速发展也有了一个全新的认识。所谓的无头模式,其实就是模拟浏览器的行为,只是不需要再真正打开一个可见的网页窗口;然后根据这个页面的网络请求响应数据进行拦截,获取对应的数据。