获取斗鱼当前直播现状(selenium、lxml 和显式等待)

最新推荐文章于 2021-03-26 17:46:10 发布

山与路

最新推荐文章于 2021-03-26 17:46:10 发布

阅读量2.9k

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/a1309525802/article/details/108491427

版权

爬虫专栏收录该内容

14 篇文章 0 订阅

订阅专栏

获取斗鱼当前直播现状

import os
import time

from selenium import webdriver
from lxml import etree
from selenium.webdriver.common.by import By #找[某个页面元素] --定义[定位器]的主要东西
from selenium.webdriver.support.wait import WebDriverWait #显示等待对象
from selenium.webdriver.support import expected_conditions as EC #  [某个条件为止]
from project.excel_utils.excel_utils import Excel_Utils


def parse_page(li_list,dir,filename,flag):
    """Extract one page of live-room cards and store them in an .xls file.

    Args:
        li_list: iterable of parsed ``<li>`` elements (objects exposing
            ``.xpath()``), one per room card.
        dir: output directory; created if it does not exist.
        filename: path of the .xls file to write or extend.
        flag: ``False`` to create ``filename`` from scratch (a header row is
            written first), ``True`` to append to the existing file.
    """
    def first_text(nodes, default=""):
        # xpath() returns a list; a card may lack the node entirely.
        return nodes[0] if nodes else default

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(dir, exist_ok=True)

    # The header belongs to every newly created file. Tying it to "directory
    # was just created" dropped the header whenever the directory already
    # existed from an earlier run.
    if flag:
        dic_list = []
    else:
        dic_list = [{'title':'标题','anchor':'主播','game':'游戏','hot':'热度','label_wrap':'标签'}]

    for li in li_list:
        anchor = li.xpath('.//div[@class="DyListCover-userName"]/text()')
        # Cards without an anchor name are skipped.
        if not anchor:
            continue
        dic_list.append({
            'title': first_text(li.xpath(".//h3/text()")),
            'anchor': anchor[0],
            'game': first_text(li.xpath('.//span[@class="DyListCover-zone"]/text()')),
            'hot': first_text(li.xpath('.//span[@class="DyListCover-hot"]/text()')),
            'label_wrap': first_text(
                li.xpath('.//span[@class="HeaderCell-label-wrap is-od"]/text()'),
                "该主播暂时没有描述",
            ),
        })

    if flag:
        # Nothing to append on a page that yielded no records.
        if dic_list:
            Excel_Utils.add_to_excel(dic_list,filename)
    else:
        Excel_Utils.write_to_excel(dic_list, filename)
# Scrape entry point
def main():
    """Scrape every page of the Douyu console-game category into one .xls file.

    Uses the module-level ``driver`` (WebDriver) and ``wait`` (WebDriverWait)
    objects created by the ``__main__`` block. The first page creates the
    file; each following page is appended to it.
    """
    # False: create the file on the first page; True: append afterwards.
    flag = False
    # Only the console-game category is scraped.
    url = "https://www.douyu.com/g_TVgame"
    # Output path is fixed once per run so an hour rollover in the middle of
    # pagination cannot redirect appends to a file that was never created.
    out_dir = "斗鱼/单机热游/主机游戏"
    filename = out_dir+"/"+str(time.strftime("%Y-%m-%d %H",time.localtime()))+"点.xls"
    try:
        driver.get(url)
        driver.maximize_window()
        # Block until the room list of the first page is present in the DOM.
        wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="listAll"]/div[2]/ul/li')))
        # A file left over from an earlier run in the same hour is replaced.
        if os.path.exists(filename):
            os.remove(filename)
        while True:
            # NOTE (from the original author): the same presence wait is not
            # repeated here because after clicking "next" it returns at once
            # while the new list is still loading; a fixed sleep is used
            # after the click instead.
            tree = etree.HTML(driver.page_source)
            li_list = tree.xpath('//ul[@class="layout-Cover-list"]/li')
            parse_page(li_list,dir=out_dir,filename=filename,flag=flag)
            # find_element_by_xpath was removed in Selenium 4; the By-based
            # form works on both Selenium 3 and 4.
            next_page = driver.find_element(By.XPATH, '//ul[@class="dy-Pagination ListPagination"]/li[last()]')
            # The last pagination item is disabled on the final page.
            if next_page.get_attribute("aria-disabled") == "true":
                break
            next_page.click()
            flag = True
            time.sleep(5)
    finally:
        # Always release the browser and the driver process, even on errors.
        driver.quit()
if __name__ == '__main__':
    # Browser session shared with main() through module-level globals.
    driver = webdriver.Chrome()
    # Explicit-wait helper bound to that browser; gives up after 20 seconds.
    wait = WebDriverWait(driver, timeout=20)
    main()

project/excel_utils/excel_utils.py

import xlwt
import xlrd
from xlutils.copy import copy


class Excel_Utils:
    """Static helpers that write lists of row dicts to .xls workbooks."""

    @staticmethod
    def write_to_excel(infos,filename,sheetname='sheet1'):
        """Create a workbook at ``filename`` containing ``infos`` in one sheet.

        Args:
            infos: list of dicts sharing the same keys. The key order of
                ``infos[0]`` defines the column order, and the values of
                ``infos[0]`` are written to row 0 (callers put the header
                labels there). An empty list produces an empty sheet.
            filename: path of the .xls file to create.
            sheetname: name of the sheet to add.
        """
        work_book = xlwt.Workbook(encoding="utf-8")
        sheet = work_book.add_sheet(sheetname)
        if infos:
            head = list(infos[0])
            # Row 0 receives infos[0]; every later item goes to the row
            # matching its index.
            for row, info in enumerate(infos):
                for col, key in enumerate(head):
                    sheet.write(row, col, info[key])
        work_book.save(filename)
        print("存储成功!!")

    @staticmethod
    def add_to_excel(infos,filename,sheetname='sheet1'):
        """Append ``infos`` below the existing rows of a workbook.

        Args:
            infos: list of dicts sharing the same keys; the key order of
                ``infos[0]`` defines the column order. An empty list is a
                no-op and the file is not opened.
            filename: path of an existing .xls file.
            sheetname: sheet to extend; when the workbook has no sheet with
                this name, the first sheet is used.
        """
        if not infos:
            return
        work_book = xlrd.open_workbook(filename=filename)
        sheet_names = work_book.sheet_names()
        # Honour the requested sheet, falling back to the first one.
        sheet_index = sheet_names.index(sheetname) if sheetname in sheet_names else 0
        old_rows = work_book.sheet_by_index(sheet_index).nrows
        # xlrd workbooks are read-only; xlutils.copy yields a writable one.
        new_work_book = copy(work_book)
        new_sheet = new_work_book.get_sheet(sheet_index)
        head = list(infos[0])
        for offset, info in enumerate(infos):
            for col, key in enumerate(head):
                new_sheet.write(old_rows + offset, col, info[key])
        new_work_book.save(filename)
        print("追加成功!!!")

山与路

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
获取斗鱼当前直播现状(selenium、lxml 和显式等待)

获取斗鱼当前直播现状 import os import time from selenium import webdriver from lxml import etree from selenium.webdriver.common.by import By #找[某个页面元素] --定义[定位器]的主要东西 from selenium.webdriver.support.wait import WebDriverWait #显式等待对象 from selenium.webdriver.support…
复制链接

扫一扫