# Fetch the current live-streaming status of Douyu (获取斗鱼当前直播现状).
import os
import time
from selenium import webdriver
from lxml import etree
from selenium.webdriver.common.by import By #找[某个页面元素] --定义[定位器]的主要东西
from selenium.webdriver.support.wait import WebDriverWait #显示等待对象
from selenium.webdriver.support import expected_conditions as EC # [某个条件为止]
from project.excel_utils.excel_utils import Excel_Utils
def parse_page(li_list, dir, filename, flag):
    """Extract room info from listing ``<li>`` nodes and store it in an .xls file.

    Args:
        li_list: lxml elements, one per room card of the listing page.
        dir: Output directory; created when it does not exist yet. (The name
            shadows the builtin but is kept because callers pass it by keyword.)
        filename: Path of the workbook to create or extend.
        flag: ``False`` to create a new workbook, which always starts with a
            header row; ``True`` to append the rows to the existing workbook.
    """
    os.makedirs(dir, exist_ok=True)

    # The header belongs to every *new workbook*. It used to be emitted only
    # when the directory had to be created, so later runs into an existing
    # directory produced files without a header row.
    rows = []
    if not flag:
        rows.append({'title': '标题', 'anchor': '主播', 'game': '游戏',
                     'hot': '热度', 'label_wrap': '标签'})

    for li in li_list:
        anchor = li.xpath('.//div[@class="DyListCover-userName"]/text()')
        if not anchor:
            # Cards without a streamer name are not stored.
            continue
        label_wrap = li.xpath('.//span[@class="HeaderCell-label-wrap is-od"]/text()')
        rows.append({
            'title': _first_text(li.xpath(".//h3/text()")),
            'anchor': anchor[0],
            'game': _first_text(li.xpath('.//span[@class="DyListCover-zone"]/text()')),
            'hot': _first_text(li.xpath('.//span[@class="DyListCover-hot"]/text()')),
            'label_wrap': label_wrap[0] if label_wrap else "该主播暂时没有描述",
        })

    if not flag:
        Excel_Utils.write_to_excel(rows, filename)
    elif rows:
        # Nothing to append on an empty page; add_to_excel needs >= 1 row.
        Excel_Utils.add_to_excel(rows, filename)


def _first_text(nodes, default=""):
    """Return the first xpath text result, or *default* when nothing matched."""
    return nodes[0] if nodes else default
# Crawl entry point: fetch every page of the listing.
def main():
    """Crawl every page of Douyu's console-game listing into an hourly .xls file.

    Uses the module-level ``driver`` (Chrome WebDriver) and ``wait``
    (WebDriverWait) objects created in the ``__main__`` block. The browser
    session is always shut down on exit, even when the crawl fails.
    """
    # Only the console-game category is crawled here.
    url = "https://www.douyu.com/g_TVgame"
    out_dir = "斗鱼/单机热游/主机游戏"
    # Computed once: recomputing the name per page would switch files when the
    # hour changes mid-crawl and then try to append to a workbook that does
    # not exist.
    filename = out_dir + "/" + str(time.strftime("%Y-%m-%d %H", time.localtime())) + "点.xls"
    # False -> the first page creates the workbook; True -> later pages append.
    flag = False
    try:
        driver.get(url)
        driver.maximize_window()
        # Wait until the room cards of the first page are present in the DOM.
        wait.until(EC.presence_of_all_elements_located(
            (By.XPATH, '//*[@id="listAll"]/div[2]/ul/li')))
        # A workbook left over from an earlier run in the same hour is replaced.
        if os.path.exists(filename):
            os.remove(filename)
        while True:
            # No explicit wait here: after clicking "next page" the old <li>
            # nodes are still present while the new ones load, so a presence
            # check would succeed too early. A fixed pause is used instead.
            tree = etree.HTML(driver.page_source)
            li_list = tree.xpath('//ul[@class="layout-Cover-list"]/li')
            parse_page(li_list, dir=out_dir, filename=filename, flag=flag)
            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which no longer exists in Selenium 4.
            next_page = driver.find_element(
                By.XPATH, '//ul[@class="dy-Pagination ListPagination"]/li[last()]')
            if next_page.get_attribute("aria-disabled") == "true":
                # The "next" button is disabled on the last page.
                break
            next_page.click()
            flag = True
            time.sleep(5)
    finally:
        # quit() ends the whole session (browser and driver process);
        # close() would only close the current window.
        driver.quit()
if __name__ == '__main__':
    # Browser session shared with main() through module-level globals.
    driver = webdriver.Chrome()
    # Explicit-wait helper bound to that browser; 20 is the maximum number
    # of seconds a wait.until(...) call may block.
    wait = WebDriverWait(driver, timeout=20)
    main()
# ---- File: project/excel_utils/excel_utils.py ----
import xlwt
import xlrd
from xlutils.copy import copy
class Excel_Utils:
    """Static helpers that store lists of dicts in legacy .xls workbooks."""

    @staticmethod
    def write_to_excel(infos, filename, sheetname='sheet1'):
        """Create a new workbook at *filename* holding one row per dict.

        Args:
            infos: List of dicts. The key order of the first dict defines the
                column order; every dict must contain those keys. The first
                dict lands in row 0, so callers put the header dict first.
                An empty list produces a workbook with an empty sheet.
            filename: Destination path of the .xls file.
            sheetname: Name of the single sheet that is created.
        """
        work_book = xlwt.Workbook(encoding="utf-8")
        sheet = work_book.add_sheet(sheetname)
        if infos:
            head = list(infos[0].keys())
            # Row 0 is simply infos[0], so header and data share one loop.
            for row, info in enumerate(infos):
                for col, key in enumerate(head):
                    sheet.write(row, col, info[key])
        work_book.save(filename)
        print("存储成功!!")

    @staticmethod
    def add_to_excel(infos, filename, sheetname='sheet1'):
        """Append one row per dict below the existing rows of the first sheet.

        Args:
            infos: List of dicts; the key order of the first dict defines the
                column order. An empty list is a no-op.
            filename: Path of an existing .xls workbook; it is rewritten in
                place.
            sheetname: Accepted for signature symmetry with write_to_excel
                but not used; rows always go to the first sheet.
        """
        if not infos:
            # Nothing to append; avoids indexing infos[0] on an empty list.
            return
        work_book = xlrd.open_workbook(filename=filename)
        # Number of rows already present in the first sheet.
        old_rows = work_book.sheet_by_index(0).nrows
        # xlrd workbooks are read-only; copy() yields a writable xlwt workbook.
        new_work_book = copy(work_book)
        new_sheet = new_work_book.get_sheet(0)
        head = list(infos[0].keys())
        for offset, info in enumerate(infos):
            for col, key in enumerate(head):
                new_sheet.write(old_rows + offset, col, info[key])
        new_work_book.save(filename)
        print("追加成功!!!")