# 爬取CCTV15 (scrape CCTV-15 videos)

# !/usr/bin/env python
# -*-coding:utf-8-*-
# date :2021/8/30 12:36
# author:Sabo
# CCTV官网:https://tv.cctv.com/index.shtml

from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from os import system


def get_main_links(htmlSource):
    """Collect the link targets listed inside the page's ``div.lcon`` block.

    The markup is parsed with BeautifulSoup; the first ``<div class="lcon">``
    is located and every ``<a>`` found under any of its ``<ul>`` elements
    contributes its ``href`` value, in document order.

    Args:
        htmlSource: HTML markup of the page, as a string.

    Returns:
        A list of ``href`` strings. The list is empty when the page contains
        no ``div.lcon`` element. Anchors that carry no ``href`` attribute are
        left out so callers never receive ``None`` as a URL.
    """
    mainPage = BS(htmlSource, "html.parser")
    container = mainPage.find("div", attrs={"class": "lcon"})
    if container is None:
        # Without the container there is nothing to collect; returning an
        # empty list avoids an AttributeError on None.
        return []
    return [
        a_tag["href"]
        for ul_tag in container.find_all("ul")
        # href=True keeps only anchors that actually have the attribute.
        for a_tag in ul_tag.find_all("a", href=True)
    ]


def getvideoLinks(htmlSource, print_flag):
    """Extract video links and titles from a listing page.

    Each ``<div class="text_box_141010">`` is searched for its first ``<p>``
    and, inside that, the first ``<a>``. The anchor's ``href`` and ``title``
    attributes are recorded.

    Args:
        htmlSource: HTML markup of the listing page, as a string.
        print_flag: When truthy, print each matched anchor (tag, href, title)
            and the final title list for debugging.

    Returns:
        A ``(videoLinksList, videoTitleList)`` tuple of two lists of equal
        length; element ``i`` of each list comes from the same anchor. A
        value is ``None`` when the anchor lacks the corresponding attribute.
        Boxes without a ``<p>`` or without an ``<a>`` inside it are skipped.
    """
    videoLinksList = []
    videoTitleList = []
    mainPage = BS(htmlSource, "html.parser")
    for text_box in mainPage.find_all("div", attrs={"class": "text_box_141010"}):
        paragraph = text_box.find("p")
        aTag = paragraph.find("a") if paragraph is not None else None
        if aTag is None:
            # A box without the expected <p><a> structure has no video to
            # offer; skip it instead of failing on None.
            continue
        if print_flag:
            print("aTag:", aTag)
            print("aTag href:", aTag.get("href"))
            print("aTag title:", aTag.get("title"))
        # Append to both lists together so they stay index-aligned.
        videoLinksList.append(aTag.get("href"))
        videoTitleList.append(aTag.get("title"))
    if print_flag:
        print("videoTitleList:%s" % videoTitleList)
    return videoLinksList, videoTitleList


def switchToNowWindow(driver):
    """Point the driver at the last window handle it reports.

    Args:
        driver: Object exposing ``window_handles`` (a sequence) and
            ``switch_to.window(handle)``.

    Returns:
        The same ``driver`` object, after switching.
    """
    last_handle = driver.window_handles[-1]
    driver.switch_to.window(last_handle)
    return driver


def goToMainUrl(dstUrl, videoName):
    """Open ``dstUrl`` in Chrome, submit ``videoName`` to the search box and
    return the HTML of the window that is last in the handle list afterwards.

    Args:
        dstUrl: URL of the page that hosts the search box.
        videoName: Text typed into the search box, followed by ENTER.

    Returns:
        The page source (string) of the window selected after the search.

    Raises:
        selenium.common.exceptions.WebDriverException: Propagated from
            Selenium, e.g. when the search box cannot be located.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(dstUrl)
        driver.implicitly_wait(3)
        driver.maximize_window()
        driver = switchToNowWindow(driver)
        # "id" is the locator-strategy string behind By.ID; the generic
        # find_element(by, value) form works on Selenium 3 and 4, whereas
        # find_element_by_id was removed in Selenium 4.3.
        search_box = driver.find_element("id", "mytxtdafdfasdf")
        search_box.send_keys(videoName, Keys.ENTER)
        # Presumably the search opens its results in a new window, hence
        # the second switch to the last handle.
        driver = switchToNowWindow(driver)
        return driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # Chrome/chromedriver process running.
        driver.quit()


# Download a video by running the external you-get command-line tool.
def download(savePath, videoName, videoUrl):
    """Run ``you-get`` to download ``videoUrl`` into ``savePath``.

    The command is passed as an argument list rather than a shell string, so
    spaces or shell metacharacters in the (scraped) title or path cannot
    split arguments or inject extra commands.

    Args:
        savePath: Output directory, passed to ``you-get -o``.
        videoName: Output file name, passed to ``you-get -O``.
        videoUrl: URL of the video page to download.

    Returns:
        None. The exit status of ``you-get`` is not inspected, and failing to
        launch the tool is reported on stdout rather than raised.
    """
    import subprocess  # local import keeps this block self-contained

    command = ["you-get", "-o", savePath, "-O", videoName, videoUrl]
    try:
        # check=False: a failed download must not abort the remaining ones.
        subprocess.run(command, check=False)
    except OSError as error:
        # e.g. you-get is not installed or not on PATH.
        print("Failed to run you-get: %s" % error)

def downloadAll(savePath, videoLinks, videoTitle, keywords):
    """Download every video whose title contains none of ``keywords``.

    Args:
        savePath: Directory handed to ``download`` for each video.
        videoLinks: Sequence of video URLs.
        videoTitle: Sequence of titles, index-aligned with ``videoLinks``.
        keywords: Substrings; a title containing any of them is skipped.

    Returns:
        None. For each video that is kept, its link and name are printed
        before ``download`` is called.
    """
    for position, link in enumerate(videoLinks):
        # Skip entries whose title hits any exclusion keyword.
        if any(word in videoTitle[position] for word in keywords):
            continue
        print("Link:%s" % link)
        print("Name:%s" % videoTitle[position])
        download(savePath=savePath, videoName=videoTitle[position], videoUrl=link)

def formatvideoTitle(videoTitles):
    """Replace every space in each title with a hyphen, in place.

    Args:
        videoTitles: Mutable list of title strings; it is modified directly.

    Returns:
        The same list object that was passed in.
    """
    for position, title in enumerate(videoTitles):
        videoTitles[position] = title.replace(" ", "-")
    return videoTitles


def from_signal_get_all(dst_url):
    """Open ``dst_url`` in Chrome, click the first element with class
    ``li_cur`` and return the HTML of the window that is last in the handle
    list afterwards.

    Args:
        dst_url: URL of the page to open.

    Returns:
        The page source (string) of the window selected after the click.

    Raises:
        selenium.common.exceptions.WebDriverException: Propagated from
            Selenium, e.g. when no ``li_cur`` element is present.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(dst_url)
        driver.implicitly_wait(3)
        driver.maximize_window()
        # "class name" is the locator-strategy string behind By.CLASS_NAME;
        # find_element(by, value) works on Selenium 3 and 4, whereas
        # find_element_by_class_name was removed in Selenium 4.3.
        driver.find_element("class name", "li_cur").click()
        # One switch is enough; switching to the same last handle a second
        # time changes nothing.
        driver = switchToNowWindow(driver)
        return driver.page_source
    finally:
        # This function runs once per result link, so a browser left open
        # here would accumulate one Chrome process per iteration.
        driver.quit()


def main(videoName, savePath):
    """Search the CCTV site for ``videoName`` and download the videos found.

    Steps: submit the name to the site's search box, collect the result
    links, open each link to get its video listing, then download every
    listed video whose title contains none of the exclusion keywords.

    Args:
        videoName: Programme name to search for; also appended to
            ``savePath`` to form the output directory.
        savePath: Path prefix for the output directory.

    Returns:
        None.
    """
    # Titles containing any of these substrings are not downloaded.
    excluded_words = ["会", "专辑", "旋律"]
    target_dir = savePath + videoName
    # CCTV site home page.
    home_url = "https://tv.cctv.com/index.shtml"

    search_html = goToMainUrl(dstUrl=home_url, videoName=videoName)
    # Parse the page returned after the search into result links.
    for page_link in get_main_links(htmlSource=search_html):
        listing_html = from_signal_get_all(dst_url=str(page_link))
        links, titles = getvideoLinks(htmlSource=listing_html, print_flag=True)
        print(titles)
        titles = formatvideoTitle(videoTitles=titles)
        downloadAll(
            savePath=target_dir,
            videoLinks=links,
            videoTitle=titles,
            keywords=excluded_words,
        )

if __name__ == "__main__":
    # Default run: the programme name to search for and the path prefix
    # that main() joins with that name to form the output directory.
    videoName = "乐享汇"
    savePath = r"F:/"

    # To prompt for these values instead, read videoName and a drive letter
    # (e.g. D: or F:) with input() and append "/" to the drive letter.

    main(videoName=videoName, savePath=savePath)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值