# -*- encoding:utf-8 -*-
# 作者:大帅
# 目标:获取bilibili直播所有开播房间信息并保存在本地
import requests,bs4,re,time
import xlwings as xw
from selenium import webdriver
from bs4 import BeautifulSoup
# Module-level accumulator: getroomlist() appends one
# [anchor_name, room_title, popularity] row per parsed room and returns this list.
roomlist = []
# 获取bilibili动态页面
def gethtml(url, dump_path=r'f:\workspace\example\livehtml.txt'):
    """Render *url* in Chrome, expand the room list and return the page HTML.

    The page is driven through Selenium: one control in the room-list header
    is clicked once, then a second control is clicked repeatedly (with a
    one-second pause before each click) until it can no longer be located or
    clicked. The final page source is also written to *dump_path*.

    Args:
        url: Address of the live-room listing page.
        dump_path: File that receives a UTF-8 copy of the rendered HTML.

    Returns:
        The rendered HTML as a string, or None when any step failed (in which
        case '执行异常' is printed).
    """
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.get(url)
        # 'xpath' is the locator-strategy string; find_element(by, value) is
        # used because the find_element_by_* helpers no longer exist in
        # current Selenium 4 releases.
        view_button = driver.find_element(
            'xpath', '//*[@id="room-list-section"]/div[1]/div/div[1]/div[2]'
        )
        view_button.click()
        # NOTE(review): presumably this is the "load more" control - confirm.
        # Keep clicking until locating or clicking it fails, which is taken as
        # the signal that the page is fully expanded.
        while True:
            try:
                more_button = driver.find_element(
                    'xpath', '//*[@id="room-list-section"]/div[2]/div[1]/span[1]'
                )
                time.sleep(1)
                more_button.click()
            except Exception:
                break
        html = driver.page_source
        with open(dump_path, 'w+', encoding='utf-8') as file_html:
            file_html.write(html)
        return html
    except Exception:
        # Best-effort by design: report and let the caller see None.
        print('执行异常')
        return None
    finally:
        # Always release the browser, including on failure paths.
        if driver is not None:
            driver.quit()
# 获取room详情并储存在一个2维列表里
def getroomlist(html):
    """Parse the rendered listing page and collect one row per live room.

    Every child of the <ul> inside <section id="room-list-section"> is
    scanned. The first two title="..." attribute values are taken as the room
    title and the anchor name (in that order), and the first run of digits
    sitting directly between '>' and '<' is taken as the popularity value.
    Children that lack two titles or a popularity value are skipped.

    Args:
        html: Page source as returned by gethtml(); None or an empty string
            yields no new rows.

    Returns:
        The module-level ``roomlist``, extended with
        [anchor_name, room_title, popularity] rows (all strings).
    """
    # Compiled once instead of on every iteration. [^"]* stops at the closing
    # quote, so several title attributes on one line stay separate matches.
    title_re = re.compile(r'title="([^"]*)"')
    # NOTE(review): the source text of this pattern was cut off after `>\d+`;
    # it is reconstructed here as digits enclosed by '>' and '<' - confirm.
    popularity_re = re.compile(r'>(\d+)<')

    if not html:
        return roomlist

    soup = BeautifulSoup(html, "html.parser")
    section = soup.find('section', id='room-list-section')
    room_ul = section.ul if section is not None else None
    if room_ul is None:
        # Page layout not as expected: nothing to extract.
        return roomlist

    for li in room_ul:
        li_html = str(li)
        titles = title_re.findall(li_html)
        if len(titles) < 2:
            # Whitespace nodes and incomplete entries carry no usable data.
            continue
        room_title, anchor_name = titles[0], titles[1]
        popularity = popularity_re.search(li_html)
        if popularity is None:
            continue
        roomlist.append([anchor_name, room_title, popularity.group(1)])
    return roomlist
# 将房间信息保存在xlsx文件里方便后期处理和存储
def savelivelist(livelist, path=r'f:\workspace\example\live.xlsx'):
    """Write the room rows into sheet 'sheet1' of an existing workbook.

    A header row is written at A1 and *livelist* is written starting at A2;
    the workbook is then saved in place. '完成存储' is printed on success.

    Args:
        livelist: 2-D list of [anchor_name, room_title, popularity] rows.
        path: Workbook to open and overwrite; it must already exist.

    Raises:
        Any error raised by xlwings propagates to the caller, but only after
        the workbook and the Excel instance have been closed.
    """
    app = xw.App(visible=True, add_book=False)
    try:
        app.display_alerts = False
        app.screen_updating = False
        wb = app.books.open(path)
        try:
            sheet1 = wb.sheets['sheet1']
            sheet1.range('A1').value = ['主播名', '房间名', '人气值']
            sheet1.range('A2').value = livelist
            wb.save()
        finally:
            wb.close()
    finally:
        # Never leave an Excel process behind, even when writing failed.
        app.quit()
    print ('完成存储')
# 主函数
def main():
    """Fetch the live-room listing page, parse it and save the rows to Excel."""
    url = 'http://live.bilibili.com/all'
    html = gethtml(url)
    if not html:
        # gethtml() returns None after reporting a failure; there is nothing
        # to parse or save in that case.
        return
    livelist = getroomlist(html)
    savelivelist(livelist)
# Runs the whole fetch -> parse -> save pipeline whenever this module is
# executed or imported.
# NOTE(review): there is no `if __name__ == "__main__":` guard - confirm that
# import-time execution is intended.
main()