import time

import xlwt
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
def set_style(name, height, bold=False):
    """Build an xlwt cell style with the given font settings.

    :param name: font face name, e.g. 'Times New Roman'
    :param height: font height in twips (1/20 pt); 220 == 11pt
    :param bold: whether the font is bold
    :return: an xlwt.XFStyle carrying the configured font
    """
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = name
    font.bold = bold
    # BUG FIX: xlwt uses the British spelling `colour_index`; the original
    # `color_index` silently set an unused attribute, so the colour (4 = blue)
    # was never applied.
    font.colour_index = 4
    font.height = height
    style.font = font
    return style
def clear_page():
    """Close the floating mini-player that Bilibili opens after scrolling
    to the bottom of a search page (best effort: do nothing if absent).

    Relies on the module-level selenium `driver` created in __main__.
    """
    try:
        # BUG FIX: Selenium 4's find_element takes a (By, value) locator;
        # passing a lone XPath string is not a valid strategy argument.
        driver.find_element(
            By.XPATH,
            '//*[@id="bili-search"]/div[7]/div/div/div[1]/svg').click()  # click the X to close the mini-player
    except Exception:
        # The mini-player is not always present; silently continue.
        pass
def Transfer(driver):
    """Scroll the window to the bottom three times so lazily-loaded
    content (search results, comment pages) gets rendered.

    :param driver: selenium WebDriver; only `execute_script` is used
    :return: a short status string (kept for existing callers)
    """
    try:
        # Deduplicated from three copy-pasted scroll statements; sleep only
        # between scrolls, matching the original timing exactly.
        for attempt in range(3):
            driver.execute_script("window.scrollBy(0,document.body.scrollHeight)", "")
            if attempt < 2:
                time.sleep(1)
    except Exception:
        # Best-effort: a scripting failure must not abort the crawl.
        pass
    return "Transfer successfully \n"
def get_urls(name):
    """Collect the URLs of every video found for a search keyword.

    Opens the Bilibili search result pages for `name`, reads the page
    count from the paginator, then walks each page and extracts the
    per-video links. Relies on the module-level selenium `driver`.

    :param name: search keyword (theme) to query
    :return: list of absolute video URLs ("https://...")
    """
    urls = []
    print('开始爬取评论')
    driver.get(f'https://search.bilibili.com/all?keyword={name}&page=1')
    tree = etree.HTML(driver.page_source)
    # The "last page" button sits at li[8] when there are many pages and at
    # li[4] when there are few; try the long form first, fall back on miss.
    try:
        pages = int(
            tree.xpath('/html/body/div[3]/div/div[2]/div/div[1]/div[3]/div/ul/li[8]/button/text()')[0])
    except IndexError:
        pages = int(
            tree.xpath('/html/body/div[3]/div/div[2]/div/div[1]/div[3]/div/ul/li[4]/button/text()')[0])
    print(f"当前视频共有{pages}页")
    for page in range(1, pages + 1):
        print(f"爬取第{page}页url")
        driver.get(f'https://search.bilibili.com/all?keyword={name}&page={page}')
        Transfer(driver)  # scroll so every result card is rendered
        tree = etree.HTML(driver.page_source)
        # The first results page uses a different DOM layout than the rest.
        if page == 1:
            href_xpath = '/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul[2]/li[{i}]/a/@href'
        else:
            href_xpath = '//*[@id="all-list"]/div[1]/ul/li[{i}]/div/div[1]/a/@href'
        # Probe successive list items; the first missing index raises
        # IndexError, which marks the end of the result list on this page.
        try:
            for i in range(1, 1000):
                urls.append("https:" + tree.xpath(href_xpath.format(i=i))[0])
        except IndexError:
            pass
    print("共" + str(pages) + "页视频")
    print('urls爬取完毕')
    return urls
def get_content(urls):
    """Visit each video URL, scrape commenter names, mids, timestamps and
    comment bodies, print per-video metadata, and write the accumulated
    lists to Excel after each video.

    Relies on the module-level selenium `driver`.

    :param urls: iterable of video page URLs (from get_urls)
    :return: None (results are printed and written via write_excel)
    """
    user_name_list = []
    user_mid_list = []
    user_time_list = []
    user_comment_list = []
    print('开始爬取视频评论')
    for video_sum, url in enumerate(urls, start=1):
        print(url)
        print('第' + str(video_sum) + "条视频")
        driver.get(url)
        Transfer(driver)  # scroll down so the comment counter renders
        time.sleep(4)
        comment_sum = 0
        try:
            time.sleep(10)  # give the comment section time to load
            # NOTE: Selenium 4 removed find_element_by_css_selector; use
            # explicit By locators throughout (same selectors as before).
            comment_sum = driver.find_element(
                By.CSS_SELECTOR,
                '#comment > div > div.b-head > span.b-head-t.results').text
            if comment_sum != "":
                print(comment_sum)
            else:
                comment_sum = 0
        except Exception:
            comment_sum = 0
        # BUG FIX: these were unbound when the metadata lookup failed, so
        # the prints below raised NameError; default them first.
        video_title = viedo_like = viedo_coin = viedo_collect = ''
        try:
            video_title = driver.find_element(
                By.CSS_SELECTOR, '#viewbox_report > h1 > span').text
            viedo_like = driver.find_element(
                By.CSS_SELECTOR,
                '#arc_toolbar_report > div.ops > span.like').get_attribute("title")
            viedo_coin = driver.find_element(
                By.CSS_SELECTOR,
                '#arc_toolbar_report > div.ops > span.coin').get_attribute("title")
            viedo_collect = driver.find_element(
                By.CSS_SELECTOR,
                '#arc_toolbar_report > div.ops > span.collect').get_attribute("title")
        except Exception:
            pass
        print(viedo_like)
        print(video_title)
        print(viedo_coin)
        print(viedo_collect)
        print("当前视频下共" + str(comment_sum) + "评论")
        print("---" * 20)
        # Scroll enough times to paginate through all comments (~20/page).
        for _ in range(int(int(comment_sum) / 20) + 3):
            Transfer(driver)
        for e in range(1, int(comment_sum)):
            try:
                user_mid = driver.find_element(
                    By.CSS_SELECTOR,
                    f"#comment > div > div.comment > div > div.comment-list > div:nth-child({e}) > div.con > div.user > a.name").get_attribute(
                    "data-usercard-mid")
            except Exception:
                # No more rendered comment rows; stop probing this video.
                break
            try:
                user_name = driver.find_element(
                    By.CSS_SELECTOR,
                    f'#comment > div > div.comment > div > div.comment-list > div:nth-child({e}) > div.con > div.user > a.name').text
            except Exception:
                user_name = ''
            try:
                comment_text = driver.find_element(
                    By.XPATH,
                    f'//*[@id="comment"]/div/div[2]/div/div[4]/div[{e}]/div[2]/p').text
            except Exception:
                comment_text = ''
            try:
                comment_time = driver.find_element(
                    By.CSS_SELECTOR,
                    f'#comment > div > div.comment > div > div.comment-list > div:nth-child({e}) > div.con > div.info > span.time').text
            except Exception:
                comment_time = ''
            user_name_list.append(user_name)
            user_mid_list.append(user_mid)
            # BUG FIX: the original appended the list to itself here instead
            # of the scraped timestamp.
            user_time_list.append(comment_time)
            user_comment_list.append(comment_text)
        # BUG FIX: pass the accumulated mid LIST; the original passed the
        # last single mid, mismatching the other list-valued arguments.
        write_excel(i=video_sum, name=user_name_list, mid=user_mid_list,
                    date=user_time_list, commment=user_comment_list)
        print(user_name_list)
        print(user_mid_list)
        print(user_time_list)
        print(user_comment_list)
        print("---" * 10)
# NOTE(review): this re-definition duplicates set_style earlier in the file
# and shadows it; one of the two copies should eventually be removed.
def set_style(name, height, bold=False):
    """Build an xlwt cell style with the given font settings.

    :param name: font face name, e.g. 'Times New Roman'
    :param height: font height in twips (1/20 pt); 220 == 11pt
    :param bold: whether the font is bold
    :return: an xlwt.XFStyle carrying the configured font
    """
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = name
    font.bold = bold
    # BUG FIX: xlwt uses the British spelling `colour_index`; the original
    # `color_index` silently set an unused attribute with no effect.
    font.colour_index = 4
    font.height = height
    style.font = font
    return style
# Write scraped data to an Excel workbook
def write_excel(i, name, mid, date, commment):
    """Write the scraped comment lists to an Excel workbook on the desktop.

    Note: a fresh workbook is created and saved to the same path on every
    call, so only the most recent call's data survives on disk.

    :param i: sheet name (the running video index)
    :param name: list of commenter user names
    :param mid: list of commenter mids
    :param date: list of comment timestamps
    :param commment: list of comment bodies (parameter name kept for callers)
    """
    f = xlwt.Workbook()
    sheet1 = f.add_sheet(f'{i}', cell_overwrite_ok=True)
    header = ["用户名", "mid", "评论日期", "评论"]
    style = set_style('Times New Roman', 220, True)  # hoisted: same style everywhere
    # Header row.
    for col, title in enumerate(header):
        sheet1.write(0, col, title, style)
    # BUG FIX: the original wrote the `name` list into column 0 four times
    # over; write each list into its own column instead.
    for col, values in enumerate((name, mid, date, commment)):
        for row, value in enumerate(values):
            sheet1.write(row + 1, col, value, style)
    f.save('C:\\Users\\Administrator\\Desktop\\test.xls')
def sum_id():
    """Advance the module-level comment counter `sum` by one.

    :return: the updated running total
    """
    global sum  # counter initialized in the __main__ section
    sum += 1
    return sum
if __name__ == '__main__':
    sum = 0  # global comment counter consumed by sum_id()
    theme_name = input("请输入想要查询的主题:")
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Disable image loading to speed up crawling.
    chrome_options.add_experimental_option(
        'prefs', {'profile.managed_default_content_settings.images': 2})
    # BUG FIX: Selenium 4 removed the `chrome_options=` keyword; the
    # supported parameter is `options=`.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        urls = get_urls(theme_name)
        print('共' + str(len(urls)) + '条视频')  # total number of videos found
        get_content(urls)
    finally:
        # quit() shuts down the whole browser process (close() only closes
        # the current window and can leave chromedriver running).
        driver.quit()
# (blog-footer residue, commented out so the file parses:
#  "selenium + excel", originally published 2024-05-12 20:34:18)