#!/usr/bin/env python
# -*- coding: utf-8 -*-
# date :2021/8/30 12:36
# author:Sabo
# CCTV官网:https://tv.cctv.com/index.shtml
import os
import shutil
import subprocess
from os import system

from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
def get_main_links(htmlSource):
    """Collect the ``href`` of every link inside the ``div.lcon`` container.

    Args:
        htmlSource: HTML markup of the page to scan.

    Returns:
        A list of ``href`` strings, in document order, taken from the ``<a>``
        tags found in each ``<ul>`` of the first ``<div class="lcon">``.
        The list is empty when the page has no such container.
    """
    soup = BS(htmlSource, "html.parser")
    container = soup.find("div", attrs={"class": "lcon"})
    if container is None:
        # find() yields None when nothing matches; calling find_all() on it
        # would raise AttributeError instead of reporting "no links".
        return []

    links = []
    for ul_tag in container.find_all("ul"):
        for anchor in ul_tag.find_all("a"):
            href = anchor.get("href")
            # An anchor without an href gives the caller nothing to open.
            if href:
                links.append(href)
    return links
def getvideoLinks(htmlSource, print_flag):
    """Extract the link and title of each video entry on a listing page.

    Each ``<div class="text_box_141010">`` is searched for a ``<p>`` and,
    inside it, the first ``<a>``; that anchor's ``href`` and ``title``
    attributes describe one video.

    Args:
        htmlSource: HTML markup of the listing page.
        print_flag: When truthy, print the collected titles before returning.

    Returns:
        A ``(links, titles)`` tuple of two equally long lists where
        ``links[i]`` and ``titles[i]`` come from the same anchor. Entries
        lacking the anchor, its ``href`` or its ``title`` are left out.
    """
    videoLinksList = []
    videoTitleList = []
    soup = BS(htmlSource, "html.parser")
    for box in soup.find_all("div", attrs={"class": "text_box_141010"}):
        paragraph = box.find("p")
        anchor = paragraph.find("a") if paragraph is not None else None
        if anchor is None:
            # Box without the expected <p><a> structure: nothing to extract.
            continue
        href = anchor.get("href")
        title = anchor.get("title")
        print("aTag:", anchor)
        print("aTag href:", href)
        print("aTag title:", title)
        if href is None or title is None:
            # Both values are needed later (URL and output file name), and
            # appending only one would misalign the two lists.
            continue
        videoLinksList.append(href)
        videoTitleList.append(title)
    if print_flag:
        print("videoTitleList:%s" % videoTitleList)
    return videoLinksList, videoTitleList
def switchToNowWindow(driver):
    """Focus ``driver`` on the last entry of its window-handle list.

    Args:
        driver: Object exposing ``window_handles`` and ``switch_to.window``.

    Returns:
        The same ``driver`` object that was passed in.
    """
    newest_handle = driver.window_handles[-1]
    driver.switch_to.window(newest_handle)
    return driver
def goToMainUrl(dstUrl, videoName):
    """Search the site for ``videoName`` and return the resulting page HTML.

    Opens ``dstUrl`` in a new Chrome session, types ``videoName`` followed by
    ENTER into the element whose id is ``mytxtdafdfasdf``, switches to the
    last window handle and captures that window's markup.

    Args:
        dstUrl: URL of the page that hosts the search box.
        videoName: Text to submit as the search query.

    Returns:
        The page source of the window that is current after the search.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(dstUrl)
        driver.implicitly_wait(3)
        driver.maximize_window()
        switchToNowWindow(driver)
        # find_element(By.ID, ...) is used instead of find_element_by_id,
        # which was removed in Selenium 4.3; this form works on older
        # releases as well.
        search_box = driver.find_element(By.ID, "mytxtdafdfasdf")
        search_box.send_keys(videoName, Keys.ENTER)
        switchToNowWindow(driver)
        # NOTE(review): the source is read right after submitting, with no
        # explicit wait for the results page - confirm this is sufficient.
        return driver.page_source
    finally:
        # Only the HTML string is returned, so the browser is no longer
        # needed; without quit() every call leaves a Chrome process behind.
        driver.quit()
def download(savePath, videoName, videoUrl):
    """Download one video by invoking the ``you-get`` command-line tool.

    The command is executed as an argument list without a shell, so spaces
    or shell metacharacters in any argument are passed through literally
    instead of splitting or altering the command.

    Args:
        savePath: Value passed to ``you-get -o``.
        videoName: Value passed to ``you-get -O``.
        videoUrl: URL handed to ``you-get`` as the download target.
    """
    executable = shutil.which("you-get")
    if executable is None:
        # Keep the best-effort behaviour of the shell version: report the
        # problem and let the caller carry on with the next video.
        print("you-get was not found on PATH; skipping %s" % videoUrl)
        return
    command = [executable, "-o", savePath, "-O", videoName, videoUrl]
    # check=False: a failed download must not abort the remaining ones.
    subprocess.run(command, check=False)
def downloadAll(savePath, videoLinks, videoTitle, keywords):
    """Download every video whose title contains none of ``keywords``.

    Args:
        savePath: Passed through to ``download`` as ``savePath``.
        videoLinks: Video URLs.
        videoTitle: Titles paired with ``videoLinks`` position by position;
            each title is also passed to ``download`` as ``videoName``.
        keywords: Substrings; a title containing any of them is skipped.

    Links and titles are paired with ``zip``, so when the two lists differ
    in length the unpaired tail is ignored.
    """
    for link, title in zip(videoLinks, videoTitle):
        if any(keyword in title for keyword in keywords):
            continue
        print("Link:%s" % link)
        print("Name:%s" % title)
        download(savePath=savePath, videoName=title, videoUrl=link)
def formatvideoTitle(videoTitles):
    """Replace every space in each title with a hyphen, in place.

    Args:
        videoTitles: Mutable sequence of title strings; it is modified.

    Returns:
        The same ``videoTitles`` object, after modification.
    """
    for position, title in enumerate(videoTitles):
        videoTitles[position] = title.replace(" ", "-")
    return videoTitles
def from_signal_get_all(dst_url):
    """Open ``dst_url``, click the ``li_cur`` element and return the page HTML.

    A new Chrome session loads ``dst_url`` and clicks the first element whose
    class is ``li_cur``; the driver is then switched to the last window
    handle and that window's markup is captured.

    Args:
        dst_url: URL of the page to open.

    Returns:
        The page source of the window that is current after the click.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(dst_url)
        driver.implicitly_wait(3)
        driver.maximize_window()
        # find_element(By.CLASS_NAME, ...) is used instead of
        # find_element_by_class_name, which was removed in Selenium 4.3;
        # this form works on older releases as well.
        driver.find_element(By.CLASS_NAME, "li_cur").click()
        # One switch is enough: repeating it selects the same last handle.
        switchToNowWindow(driver)
        return driver.page_source
    finally:
        # Only the HTML string is returned, so the browser is no longer
        # needed; without quit() every call leaves a Chrome process behind.
        driver.quit()
def main(videoName, savePath):
    """Search the CCTV site for ``videoName`` and download the videos found.

    The search results page is scanned for links; each link is opened, its
    video entries are extracted, their titles are normalised and the videos
    are passed to ``downloadAll``.

    Args:
        videoName: Search text; also the name of the output sub-directory.
        savePath: Directory under which the ``videoName`` folder is placed.
    """
    # Passed to downloadAll, which skips titles containing any of these.
    keywords = ["会", "专辑", "旋律"]
    # Join as path components so a savePath without a trailing separator
    # does not get the name glued directly onto its last component.
    savePath = os.path.join(savePath, videoName)
    # CCTV home page.
    mainUrl = "https://tv.cctv.com/index.shtml"
    page_source = goToMainUrl(dstUrl=mainUrl, videoName=videoName)
    # Process the search results page.
    for link in get_main_links(htmlSource=page_source):
        if not link:
            # A missing href would otherwise be opened as the text "None".
            continue
        signal_page_html = from_signal_get_all(dst_url=str(link))
        videoLinksList, videoTitleList = getvideoLinks(
            htmlSource=signal_page_html, print_flag=True
        )
        print(videoTitleList)
        videoTitleList = formatvideoTitle(videoTitles=videoTitleList)
        downloadAll(
            savePath=savePath,
            videoLinks=videoLinksList,
            videoTitle=videoTitleList,
            keywords=keywords,
        )
if __name__ == "__main__":
    # Initial settings: the programme to search for and the save location.
    # To prompt for them instead, read both with input() (the programme name,
    # then a drive letter such as D: or F:) and append "/" to the drive.
    videoName, savePath = "乐享汇", r"F:/"
    main(savePath=savePath, videoName=videoName)
# 爬取CCTV15
# 最新推荐文章于 2024-08-22 14:43:56 发布