Selenium + Excel

import time
from urllib.parse import quote

import xlwt
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def set_style(name, height, bold=False):
    """Build an xlwt cell style whose font has the given face, size and weight.

    Args:
        name: Font face name, e.g. ``'Times New Roman'``.
        height: Font height in xlwt's units (the callers in this file pass
            220; presumably 1/20 of a point -- confirm against xlwt docs).
        bold: Whether the font is bold.

    Returns:
        A new ``xlwt.XFStyle`` carrying the configured font.
    """
    font = xlwt.Font()
    font.name = name
    font.bold = bold
    # xlwt spells this attribute ``colour_index``. Assigning ``color_index``
    # only created an unused attribute, so the colour was never applied.
    # Index 4 is presumably blue in the default palette -- TODO confirm.
    font.colour_index = 4
    font.height = height

    style = xlwt.XFStyle()
    style.font = font
    return style


def clear_page():
    """Close the floating mini-player shown after scrolling to the page bottom.

    Best effort: when the close button cannot be found or clicked, nothing
    happens. Uses the module-level ``driver``.
    """
    # NOTE(review): inline <svg> elements often need
    # ``*[name()='svg']`` in browser XPath -- confirm this locator matches.
    close_button_xpath = '//*[@id="bili-search"]/div[7]/div/div/div[1]/svg'
    try:
        # find_element takes (by, value). Passing the XPath alone made it the
        # locator *strategy*, so the lookup could never succeed.
        driver.find_element(By.XPATH, close_button_xpath).click()  # the "x" button
    except Exception:
        # The mini-player is not always present; ignoring this is deliberate.
        pass


def Transfer(driver):
    """Scroll the page down three times, pausing one second between scrolls.

    Scrolling towards the bottom is what makes the page load further content.
    The operation is best effort: a failing script call ends the scrolling
    early and is otherwise ignored.

    Args:
        driver: Object exposing ``execute_script(script, *args)``.

    Returns:
        The fixed string ``"Transfer successfully \\n"``, whether or not every
        scroll actually ran.
    """
    scroll_script = "window.scrollBy(0,document.body.scrollHeight)"
    try:
        for attempt in range(3):
            if attempt:
                time.sleep(1)  # give the page time to load before scrolling again
            driver.execute_script(scroll_script, "")
    except Exception:
        # Deliberately swallowed, but no longer with a bare ``except`` so that
        # Ctrl-C / SystemExit raised during the sleeps still propagate.
        pass
    return "Transfer successfully \n"


def _first_int(tree, xpaths, default):
    """Return the first XPath text result that parses as an int, else *default*."""
    for xpath in xpaths:
        matches = tree.xpath(xpath)
        if not matches:
            continue
        try:
            return int(matches[0])
        except (TypeError, ValueError):
            continue
    return default


def _indexed_hrefs(tree, xpath_template):
    """Collect "https:"-prefixed hrefs for item 1, 2, ... until an index has no match."""
    links = []
    index = 1
    while True:
        hrefs = tree.xpath(xpath_template.format(index=index))
        if not hrefs:
            return links
        links.append("https:" + hrefs[0])
        index += 1


def get_urls(name):
    """Collect the video URLs from every search-result page for a keyword.

    Uses the module-level ``driver`` to load each result page, scrolls it with
    ``Transfer`` and extracts the links from the page source with lxml.

    Args:
        name: Search keyword (topic) to look up.

    Returns:
        A list of URL strings (``"https:"`` + the link's ``href``) in page order.
    """
    # The page count sits in one of two pagination buttons, depending on how
    # many pages the pager shows.
    page_count_xpaths = (
        '/html/body/div[3]/div/div[2]/div/div[1]/div[3]/div/ul/li[8]/button/text()',
        '/html/body/div[3]/div/div[2]/div/div[1]/div[3]/div/ul/li[4]/button/text()',
    )
    # The first result page uses a different layout from the following ones.
    first_page_links = '/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul[2]/li[{index}]/a/@href'
    other_page_links = '//*[@id="all-list"]/div[1]/ul/li[{index}]/div/div[1]/a/@href'

    # Percent-encode the keyword so characters such as '&' or '#' cannot
    # break the query string.
    keyword = quote(name, safe='')

    urls = []
    print('开始爬取评论')
    driver.get(f'https://search.bilibili.com/all?keyword={keyword}&page=1')  # open the first result page
    tree = etree.HTML(driver.page_source)
    # Fall back to a single page instead of crashing when no pager is found.
    pages = _first_int(tree, page_count_xpaths, default=1)
    print(f"当前视频共有{pages}页")

    for page in range(1, pages + 1):
        print(f"爬取第{page}页url")
        driver.get(f'https://search.bilibili.com/all?keyword={keyword}&page={page}')
        Transfer(driver)
        tree = etree.HTML(driver.page_source)
        template = first_page_links if page == 1 else other_page_links
        urls.extend(_indexed_hrefs(tree, template))

    print("共" + str(pages) + "页视频")
    print('urls爬取完毕')
    return urls


def _element_text(by, locator):
    """Return the text of the first element matching the locator, or '' on failure."""
    try:
        return driver.find_element(by, locator).text
    except Exception:
        return ''


def _element_attribute(by, locator, attribute):
    """Return an attribute of the first matching element, or '' on failure."""
    try:
        return driver.find_element(by, locator).get_attribute(attribute)
    except Exception:
        return ''


def _parse_comment_count(text):
    """Return the decimal digits of *text* as an int, or 0 when there are none."""
    digits = ''.join(ch for ch in text if ch.isdecimal())
    return int(digits) if digits else 0


def get_content(urls):
    """Scrape the comments of every video and hand them to ``write_excel``.

    For each URL the page is opened with the module-level ``driver``, the
    video statistics are printed, the page is scrolled to load the comments,
    and user name, user mid, comment time and comment text are collected.

    The four lists accumulate across all videos. ``write_excel`` creates a
    new workbook and saves to a fixed path on every call, so each save
    contains everything collected so far.

    Args:
        urls: Iterable of video page URLs.

    Returns:
        None.
    """
    uesr_name_list = []
    uesr_mid_list = []
    uesr_time_list = []
    uesr_comment_list = []
    comment_item = '#comment > div > div.comment > div > div.comment-list > div:nth-child({index})'

    print('开始爬取视频评论')
    # enumerate keeps the printed counter and the sheet number equal to the
    # 1-based position of the video.
    for video_sum, url in enumerate(urls, start=1):
        print(url)
        print('第' + str(video_sum) + "条视频")
        driver.get(url)
        Transfer(driver)  # scroll down so the comment total becomes visible
        time.sleep(14)  # the original 4 s + 10 s wait for the comment area

        count_text = _element_text(
            By.CSS_SELECTOR, '#comment > div > div.b-head > span.b-head-t.results')
        if count_text != "":
            print(count_text)
        # Tolerates non-numeric text instead of crashing in int().
        comment_sum = _parse_comment_count(count_text)

        # Each statistic defaults to '' so a failed lookup neither raises
        # NameError nor reuses the value of the previous video.
        video_title = _element_text(By.CSS_SELECTOR, '#viewbox_report > h1 > span')
        viedo_like = _element_attribute(
            By.CSS_SELECTOR, '#arc_toolbar_report > div.ops > span.like', "title")
        viedo_coin = _element_attribute(
            By.CSS_SELECTOR, '#arc_toolbar_report > div.ops > span.coin', "title")
        viedo_collect = _element_attribute(
            By.CSS_SELECTOR, '#arc_toolbar_report > div.ops > span.collect', "title")
        print(viedo_like)
        print(video_title)
        print(viedo_coin)
        print(viedo_collect)
        print("当前视频下共" + str(comment_sum) + "评论")
        print("---" * 20)

        # One extra scroll per 20 comments (plus 3) to load the list.
        for _ in range(comment_sum // 20 + 3):
            Transfer(driver)

        collected = 0
        # nth-child is 1-based; the upper bound is inclusive so the last
        # comment is not skipped.
        for e in range(1, comment_sum + 1):
            item = comment_item.format(index=e)
            name_link = f"{item} > div.con > div.user > a.name"
            try:
                uesr_mid = driver.find_element(By.CSS_SELECTOR, name_link).get_attribute(
                    "data-usercard-mid")
            except Exception:
                # No such comment node: the loaded list ends here.
                break
            user_name = _element_text(By.CSS_SELECTOR, name_link)
            comment_text = _element_text(
                By.XPATH, f'//*[@id="comment"]/div/div[2]/div/div[4]/div[{e}]/div[2]/p')
            comment_time = _element_text(
                By.CSS_SELECTOR, f'{item} > div.con > div.info > span.time')

            uesr_name_list.append(user_name)
            uesr_mid_list.append(uesr_mid)
            uesr_time_list.append(comment_time)  # was appending the list to itself
            uesr_comment_list.append(comment_text)
            collected += 1

        if collected:
            # Save once per video rather than once per comment, and pass the
            # mid *list* rather than the last single mid.
            write_excel(i=video_sum, name=uesr_name_list, mid=uesr_mid_list,
                        date=uesr_time_list, commment=uesr_comment_list)
        print(uesr_name_list)
        print(uesr_mid_list)
        print(uesr_time_list)
        print(uesr_comment_list)
        print("---" * 10)





def set_style(name, height, bold=False):
    """Return an ``xlwt.XFStyle`` with a font of the given face, height and weight.

    NOTE(review): an identical ``set_style`` is defined earlier in this file;
    this later definition is the one in effect once the module is loaded.

    Args:
        name: Font face name.
        height: Font height in xlwt's units (callers in this file pass 220).
        bold: Whether the font is bold.

    Returns:
        A new ``xlwt.XFStyle`` whose ``font`` is configured as requested.
    """
    font = xlwt.Font()
    for attribute, value in (
        ("name", name),
        ("bold", bold),
        # xlwt's attribute is ``colour_index``; the former ``color_index``
        # assignment had no effect. 4 is presumably blue -- TODO confirm.
        ("colour_index", 4),
        ("height", height),
    ):
        setattr(font, attribute, value)

    cell_style = xlwt.XFStyle()
    cell_style.font = font
    return cell_style
#写Excel

def write_excel(i, name, mid, date, commment,
                path='C:\\Users\\Administrator\\Desktop\\test.xls'):
    """Write the comment columns to one sheet of a new workbook and save it.

    Row 0 holds the headers; each argument fills one column below its header:
    ``name`` -> column 0, ``mid`` -> column 1, ``date`` -> column 2,
    ``commment`` -> column 3. A new workbook is created on every call, so an
    existing file at ``path`` is replaced.

    Args:
        i: Value used as the sheet name.
        name: User names.
        mid: User mids.
        date: Comment dates.
        commment: Comment texts.
        path: Destination ``.xls`` file; defaults to the previously
            hard-coded desktop path.
    """
    def as_column(values):
        # A bare string is one value, not a sequence of characters.
        if values is None:
            return []
        if isinstance(values, (str, bytes)):
            return [values]
        return list(values)

    f = xlwt.Workbook()
    sheet1 = f.add_sheet(f'{i}', cell_overwrite_ok=True)

    row0 = ["用户名", "mid", "评论日期", "评论"]
    columns = [as_column(name), as_column(mid), as_column(date), as_column(commment)]

    # Build the style once and reuse it instead of creating one per cell.
    cell_style = set_style('Times New Roman', 220, True)

    # Header row.
    for col, title in enumerate(row0):
        sheet1.write(0, col, title, cell_style)

    # Data: every list goes into its own column (previously all four loops
    # rewrote column 0 with the user names).
    for col, values in enumerate(columns):
        for row, value in enumerate(values, start=1):
            try:
                sheet1.write(row, col, value, cell_style)
            except Exception:
                # Presumably xlwt rejected the value's type; store its text.
                sheet1.write(row, col, str(value), cell_style)

    f.save(path)






def sum_id():
    """Increment the module-level counter ``sum`` by one and return its new value.

    The counter is the global ``sum`` assigned in the ``__main__`` section.
    """
    global sum
    sum += 1
    return sum



if __name__ == '__main__':
    # Counter read by sum_id(); the name shadows the builtin but is kept
    # because sum_id() declares it global.
    sum = 0
    theme_name = input("请输入想要查询的主题:")

    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Disable image loading to speed up page loads.
    chrome_options.add_experimental_option(
        'prefs', {'profile.managed_default_content_settings.images': 2})

    # ``options=`` replaces the deprecated ``chrome_options=`` keyword, which
    # recent Selenium 4 releases no longer accept.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        urls = get_urls(theme_name)
        print('共' + str(len(urls)) + '条视频')  # number of videos found
        get_content(urls)
    finally:
        # quit() ends the whole browser session even when scraping fails;
        # close() only closed the current window.
        driver.quit()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

menike3

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值