Python爬取CCTV15

最新推荐文章于 2022-11-06 14:49:05 发布

双燕难凭远信

最新推荐文章于 2022-11-06 14:49:05 发布

阅读量338

点赞数

分类专栏： Python学习记录　文章标签： python 爬虫 pycharm

本文链接：https://blog.csdn.net/CSDNsabo/article/details/120087053

版权

Python学习记录专栏收录该内容

26 篇文章 0 订阅

订阅专栏

1.下载视频-精彩音乐汇

#!/usr/bin/env python
# -*-coding:utf-8-*-
# date :2021/8/31 22:37
# author:Sabo

from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from os import system
from os import listdir
from os import mkdir


def getvideoLinks(htmlSource):
    """Collect video links and titles from the list following the "timg" block.

    The markup is parsed with the "html.parser" backend. The first
    ``<div class="timg">`` is located, the next ``<ul>`` after it is taken,
    and every ``<a>`` inside that list is read.

    Args:
        htmlSource: HTML markup of the page to scan.

    Returns:
        A ``(links, titles)`` tuple of parallel lists holding each anchor's
        ``lanmu1`` attribute and ``title`` attribute.
    """
    soup = BS(htmlSource, "html.parser")
    anchors = soup.find("div", attrs={"class": "timg"}).find_next("ul").find_all("a")
    links = [anchor.get("lanmu1") for anchor in anchors]
    titles = [anchor.get("title") for anchor in anchors]
    return links, titles


def switchToNowWindow(driver):
    """Focus the driver on the last entry of its window-handle list.

    Args:
        driver: Object exposing ``window_handles`` and ``switch_to.window``.

    Returns:
        The same ``driver`` object, after switching.
    """
    newest_handle = driver.window_handles[-1]
    driver.switch_to.window(newest_handle)
    return driver


def goToMainUrl(dstUrl, videoName):
    """Open ``dstUrl`` in Chrome, submit ``videoName`` and return the resulting HTML.

    A new Chrome session is started, ``videoName`` followed by ENTER is typed
    into the element whose id is "mytxtdafdfasdf" (presumably the site search
    box), the driver is switched to the newest window and that window's page
    source is returned.

    Args:
        dstUrl: Address of the page to open first.
        videoName: Text typed into the input element.

    Returns:
        The page source of the newest window after the text was submitted.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(dstUrl)
        driver.implicitly_wait(3)
        driver.maximize_window()
        driver = switchToNowWindow(driver)
        # find_element(by, value) exists in both Selenium 3 and 4, whereas the
        # find_element_by_* shortcuts were removed in Selenium 4.3.
        search_box = driver.find_element("id", "mytxtdafdfasdf")
        search_box.send_keys(videoName, Keys.ENTER)
        driver = switchToNowWindow(driver)
        return driver.page_source
    finally:
        # Only the HTML string is needed by the caller; always close the
        # browser so Chrome processes do not pile up.
        driver.quit()


# Download one video by running the external you-get command-line tool.
def download(savePath, videoName, videoUrl):
    """Run ``you-get`` to download ``videoUrl`` into ``savePath`` as ``videoName``.

    The command is passed to the operating system as an argument list, so
    characters in the (scraped) title or URL such as spaces, ``&`` or ``|``
    are never interpreted by a shell.

    Args:
        savePath: Output directory, passed to ``you-get -o``.
        videoName: Output file name without extension, passed to ``you-get -O``.
        videoUrl: Address of the video page to download.

    Returns:
        None. The exit status of you-get is not inspected.
    """
    import subprocess  # local import keeps this block self-contained

    command = ["you-get", "-o", savePath, "-O", videoName, videoUrl]
    print(subprocess.list2cmdline(command))
    try:
        subprocess.run(command, shell=False)
    except OSError as error:
        # A missing or non-executable you-get must not abort the whole batch.
        print("无法启动 you-get：%s" % error)


def downloadAll(savePath, videoLinks, videoTitle, keywords):
    """Download every video whose title passes the keyword and duplicate filters.

    A video is skipped when its title contains any entry of ``keywords`` or
    when ``<title>.mp4`` is already present in ``savePath``. The duplicate
    check is applied independently of the keyword list, so it also works
    when ``keywords`` is empty.

    Args:
        savePath: Existing directory that receives the downloads.
        videoLinks: Video page addresses.
        videoTitle: Titles, parallel to ``videoLinks``.
        keywords: Substrings that disqualify a title.

    Returns:
        None.
    """
    # Snapshot of the folder taken once; a set gives cheap membership tests.
    existing_files = set(listdir(path=savePath))
    for link, title in zip(videoLinks, videoTitle):
        if any(keyword in title for keyword in keywords):
            print("%s 不符合下载条件！" % title)
            continue
        if (title + ".mp4") in existing_files:
            print("%s 已经存在！" % title)
            continue
        download(savePath=savePath, videoName=title, videoUrl=link)


def formatvideoTitle(videoTitles):
    """Replace every space in each title with a hyphen, modifying the list in place.

    Args:
        videoTitles: List of title strings.

    Returns:
        The same list object, with its elements rewritten.
    """
    for position, title in enumerate(videoTitles):
        videoTitles[position] = title.replace(" ", "-")
    return videoTitles


def get_main_url(page_suorce):
    """Return the ``href`` of every anchor inside the lists of the "lcon" block.

    The markup is parsed with the "html.parser" backend; the first
    ``<div class="lcon">`` is located and each ``<ul>`` below it is scanned
    for ``<a>`` tags.

    Args:
        page_suorce: HTML markup of the page to scan.

    Returns:
        A list with the ``href`` attribute of each anchor, in document order
        per list.
    """
    soup = BS(page_suorce, 'html.parser')
    container = soup.find("div", attrs={"class": "lcon"})
    return [
        anchor.get("href")
        for ul_tag in container.find_all("ul")
        for anchor in ul_tag.find_all("a")
    ]


def get_signal_download_link(dstUrl):
    """Open ``dstUrl`` in Chrome, click the "li_cur" element and return the HTML.

    A new Chrome session is started, the first element with class "li_cur"
    is clicked, the driver is switched to the newest window and that
    window's page source is returned.

    Args:
        dstUrl: Address of the page to open.

    Returns:
        The page source of the newest window after the click.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(dstUrl)
        driver.implicitly_wait(3)
        driver.maximize_window()
        driver = switchToNowWindow(driver)
        # find_element(by, value) exists in both Selenium 3 and 4, whereas the
        # find_element_by_* shortcuts were removed in Selenium 4.3.
        driver.find_element("class name", "li_cur").click()
        driver = switchToNowWindow(driver)
        return driver.page_source
    finally:
        # This function runs once per result link; without quit() every call
        # would leave a Chrome process behind.
        driver.quit()


def get_dst_link(page_source, print_flag):
    """Extract the link and title of each "text_box_141010" entry on a page.

    For every ``<div class="text_box_141010">`` the first ``<a>`` inside its
    first ``<p>`` is read. Entries without such an anchor, or whose anchor
    lacks an ``href`` or ``title`` attribute, are skipped so that both
    returned lists contain only strings and stay aligned.

    Args:
        page_source: HTML markup of the page to scan.
        print_flag: When true, print both result lists.

    Returns:
        A ``(videoLinksList, videoTitleList)`` tuple of parallel lists.
    """
    videoLinksList = []
    videoTitleList = []
    soup = BS(page_source, "html.parser")
    for box in soup.find_all("div", attrs={"class": "text_box_141010"}):
        paragraph = box.find("p")
        anchor = paragraph.find("a") if paragraph is not None else None
        if anchor is None:
            continue
        link = anchor.get("href")
        title = anchor.get("title")
        if link is None or title is None:
            continue
        videoLinksList.append(link)
        videoTitleList.append(title)
    if print_flag:
        print("videoLinksList:%s" % videoLinksList)
        print("videoTitleList:%s" % videoTitleList)
    return videoLinksList, videoTitleList


def check_dir_exist(save_path, video_name):
    """Ensure the folder ``video_name`` exists inside ``save_path``.

    The path that is reported is the same path that is created, whether or
    not ``save_path`` ends with a separator.

    Args:
        save_path: Existing parent directory.
        video_name: Name of the folder to look for or create.

    Returns:
        True when the folder already existed, False when it was just created.

    Raises:
        OSError: If the folder cannot be created, e.g. because a regular
            file with the same name is in the way.
    """
    from os import path as os_path  # local import keeps this block self-contained

    target_dir = os_path.join(save_path, video_name)
    if os_path.isdir(target_dir):
        print("%s已经存在！无需创建~" % target_dir)
        return True
    print("%s不存在！稍后将创建~" % target_dir)
    mkdir(target_dir)
    return False


def main(videoName, savePath):
    """Search the CCTV site for ``videoName`` and download the matching videos.

    Args:
        videoName: Text to search for; also the name of the download folder.
        savePath: Parent path that ``videoName`` is appended to directly, so
            it is expected to end with a path separator.
    """
    # Titles containing any of these substrings are passed to downloadAll as
    # exclusion keywords.
    keywords = ["会", "专辑", "旋律", "《精彩音乐汇》"]
    check_dir_exist(save_path=savePath, video_name=videoName)
    savePath = savePath + videoName

    # CCTV site home page.
    mainUrl = "https://tv.cctv.com/index.shtml"
    search_html = goToMainUrl(dstUrl=mainUrl, videoName=videoName)
    for result_link in get_main_url(page_suorce=search_html):
        episode_html = get_signal_download_link(dstUrl=str(result_link))
        videoLinksList, videoTitleList = get_dst_link(page_source=episode_html, print_flag=False)
        # Normalise the scraped titles before they are used as file names.
        videoTitleList = formatvideoTitle(videoTitles=videoTitleList)
        downloadAll(savePath=savePath, videoLinks=videoLinksList, videoTitle=videoTitleList, keywords=keywords)


if __name__ == '__main__':
    # Initial variables: the text to search for and the parent path under
    # which the folder of the same name is created.
    videoName = "精彩音乐汇"
    savePath = r"F:/"

    # Disabled interactive alternative: prompt for the video name and for a
    # drive letter, then append "/" to the drive letter.
    # videoName = str(input("请输入你想下载的视频名字:"))
    # savePath = str(input("请输入你要保存的盘符名字（例如D:或者F:等）"))
    # savePath+="/"

    # print(savePath)
    main(videoName=videoName, savePath=savePath)

2.生成对应的txt文件（用于存储标题和对应的链接）

#!/usr/bin/env python
# -*-coding:utf-8-*-
# date :2021/8/31 22:37
# author:Sabo

from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from os import system
from os import listdir
from os import mkdir

def getvideoLinks(htmlSource):
    """Collect video links and titles from the list following the "timg" block.

    The markup is parsed with the "html.parser" backend. The first
    ``<div class="timg">`` is located, the next ``<ul>`` after it is taken,
    and every ``<a>`` inside that list is read.

    Args:
        htmlSource: HTML markup of the page to scan.

    Returns:
        A ``(links, titles)`` tuple of parallel lists holding each anchor's
        ``lanmu1`` attribute and ``title`` attribute.
    """
    soup = BS(htmlSource, "html.parser")
    anchors = soup.find("div", attrs={"class": "timg"}).find_next("ul").find_all("a")
    links = [anchor.get("lanmu1") for anchor in anchors]
    titles = [anchor.get("title") for anchor in anchors]
    return links, titles


def switchToNowWindow(driver):
    """Focus the driver on the last entry of its window-handle list.

    Args:
        driver: Object exposing ``window_handles`` and ``switch_to.window``.

    Returns:
        The same ``driver`` object, after switching.
    """
    newest_handle = driver.window_handles[-1]
    driver.switch_to.window(newest_handle)
    return driver


def goToMainUrl(dstUrl, videoName):
    """Open ``dstUrl`` in Chrome, submit ``videoName`` and return the resulting HTML.

    A new Chrome session is started, ``videoName`` followed by ENTER is typed
    into the element whose id is "mytxtdafdfasdf" (presumably the site search
    box), the driver is switched to the newest window and that window's page
    source is returned.

    Args:
        dstUrl: Address of the page to open first.
        videoName: Text typed into the input element.

    Returns:
        The page source of the newest window after the text was submitted.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(dstUrl)
        driver.implicitly_wait(3)
        driver.maximize_window()
        driver = switchToNowWindow(driver)
        # find_element(by, value) exists in both Selenium 3 and 4, whereas the
        # find_element_by_* shortcuts were removed in Selenium 4.3.
        search_box = driver.find_element("id", "mytxtdafdfasdf")
        search_box.send_keys(videoName, Keys.ENTER)
        driver = switchToNowWindow(driver)
        return driver.page_source
    finally:
        # Only the HTML string is needed by the caller; always close the
        # browser so Chrome processes do not pile up.
        driver.quit()


# Download one video by running the external you-get command-line tool.
def download(savePath, videoName, videoUrl):
    """Run ``you-get`` to download ``videoUrl`` into ``savePath`` as ``videoName``.

    The command is passed to the operating system as an argument list, so
    characters in the (scraped) title or URL such as spaces, ``&`` or ``|``
    are never interpreted by a shell.

    Args:
        savePath: Output directory, passed to ``you-get -o``.
        videoName: Output file name without extension, passed to ``you-get -O``.
        videoUrl: Address of the video page to download.

    Returns:
        None. The exit status of you-get is not inspected.
    """
    import subprocess  # local import keeps this block self-contained

    command = ["you-get", "-o", savePath, "-O", videoName, videoUrl]
    print(subprocess.list2cmdline(command))
    try:
        subprocess.run(command, shell=False)
    except OSError as error:
        # A missing or non-executable you-get must not abort the whole batch.
        print("无法启动 you-get：%s" % error)


def downloadAll(savePath, videoLinks, videoTitle, keywords):
    """Download every video whose title passes the keyword and duplicate filters.

    A video is skipped when its title contains any entry of ``keywords`` or
    when ``<title>.mp4`` is already present in ``savePath``. The duplicate
    check is applied independently of the keyword list, so it also works
    when ``keywords`` is empty.

    Args:
        savePath: Existing directory that receives the downloads.
        videoLinks: Video page addresses.
        videoTitle: Titles, parallel to ``videoLinks``.
        keywords: Substrings that disqualify a title.

    Returns:
        None.
    """
    # Snapshot of the folder taken once; a set gives cheap membership tests.
    existing_files = set(listdir(path=savePath))
    for link, title in zip(videoLinks, videoTitle):
        if any(keyword in title for keyword in keywords):
            print("%s 不符合下载条件！" % title)
            continue
        if (title + ".mp4") in existing_files:
            print("%s 已经存在！" % title)
            continue
        download(savePath=savePath, videoName=title, videoUrl=link)


def formatvideoTitle(videoTitles):
    """Replace every space in each title with a hyphen, modifying the list in place.

    Args:
        videoTitles: List of title strings.

    Returns:
        The same list object, with its elements rewritten.
    """
    for position, title in enumerate(videoTitles):
        videoTitles[position] = title.replace(" ", "-")
    return videoTitles


def get_main_url(page_suorce):
    """Return the ``href`` of every anchor inside the lists of the "lcon" block.

    The markup is parsed with the "html.parser" backend; the first
    ``<div class="lcon">`` is located and each ``<ul>`` below it is scanned
    for ``<a>`` tags.

    Args:
        page_suorce: HTML markup of the page to scan.

    Returns:
        A list with the ``href`` attribute of each anchor, in document order
        per list.
    """
    soup = BS(page_suorce, 'html.parser')
    container = soup.find("div", attrs={"class": "lcon"})
    return [
        anchor.get("href")
        for ul_tag in container.find_all("ul")
        for anchor in ul_tag.find_all("a")
    ]


def get_signal_download_link(dstUrl):
    """Open ``dstUrl`` in Chrome, click the "li_cur" element and return the HTML.

    A new Chrome session is started, the first element with class "li_cur"
    is clicked, the driver is switched to the newest window and that
    window's page source is returned.

    Args:
        dstUrl: Address of the page to open.

    Returns:
        The page source of the newest window after the click.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(dstUrl)
        driver.implicitly_wait(3)
        driver.maximize_window()
        driver = switchToNowWindow(driver)
        # find_element(by, value) exists in both Selenium 3 and 4, whereas the
        # find_element_by_* shortcuts were removed in Selenium 4.3.
        driver.find_element("class name", "li_cur").click()
        driver = switchToNowWindow(driver)
        return driver.page_source
    finally:
        # This function runs once per result link; without quit() every call
        # would leave a Chrome process behind.
        driver.quit()


def get_dst_link(page_source, print_flag):
    """Extract the link and title of each "text_box_141010" entry on a page.

    For every ``<div class="text_box_141010">`` the first ``<a>`` inside its
    first ``<p>`` is read. Entries without such an anchor, or whose anchor
    lacks an ``href`` or ``title`` attribute, are skipped so that both
    returned lists contain only strings and stay aligned.

    Args:
        page_source: HTML markup of the page to scan.
        print_flag: When true, print both result lists.

    Returns:
        A ``(videoLinksList, videoTitleList)`` tuple of parallel lists.
    """
    videoLinksList = []
    videoTitleList = []
    soup = BS(page_source, "html.parser")
    for box in soup.find_all("div", attrs={"class": "text_box_141010"}):
        paragraph = box.find("p")
        anchor = paragraph.find("a") if paragraph is not None else None
        if anchor is None:
            continue
        link = anchor.get("href")
        title = anchor.get("title")
        if link is None or title is None:
            continue
        videoLinksList.append(link)
        videoTitleList.append(title)
    if print_flag:
        print("videoLinksList:%s" % videoLinksList)
        print("videoTitleList:%s" % videoTitleList)
    return videoLinksList, videoTitleList


def check_file_exist_in_dst_dir(dst_dir, dst_file):
    """Make sure ``dst_file`` is listed in ``dst_dir``, creating it empty if not.

    Args:
        dst_dir: Existing directory to inspect.
        dst_file: File name to look for in the directory listing.

    Returns:
        True when the name was already listed, False when an empty file was
        just created.
    """
    full_name = dst_dir + "/" + dst_file
    if dst_file in listdir(dst_dir):
        print("《%s》文件已经存在！无需创建~" % full_name)
        return True
    print("《%s》文件不存在！稍后将创建~" % full_name)
    # Opening in "w" mode and closing straight away leaves an empty file.
    open(file=full_name, mode="w").close()
    return False


def check_dir_exist(save_path, video_name):
    """Ensure the folder ``video_name`` exists inside ``save_path``.

    The path that is reported is the same path that is created, whether or
    not ``save_path`` ends with a separator.

    Args:
        save_path: Existing parent directory.
        video_name: Name of the folder to look for or create.

    Returns:
        True when the folder already existed, False when it was just created.

    Raises:
        OSError: If the folder cannot be created, e.g. because a regular
            file with the same name is in the way.
    """
    from os import path as os_path  # local import keeps this block self-contained

    target_dir = os_path.join(save_path, video_name)
    if os_path.isdir(target_dir):
        print("《%s》文件夹 已经存在！无需创建~" % target_dir)
        return True
    # As before, "./" is stripped from the displayed path in this message only.
    print("《%s》文件夹 不存在！稍后将创建~" % target_dir.replace("./", ""))
    mkdir(target_dir)
    return False


def write_files(videoLinksList, videoTitleList, orignal_save_path, video_name="精彩音乐汇"):
    """Append new title/link pairs to ``UrlAndTitle.txt`` in the video folder.

    The record file lives at ``<orignal_save_path><video_name>/UrlAndTitle.txt``.
    A pair is written only when its title line is not yet in the file; titles
    written during this call are remembered too, so a title repeated within
    the same batch is stored once.

    Args:
        videoLinksList: Links, parallel to ``videoTitleList``.
        videoTitleList: Titles to record.
        orignal_save_path: Parent path that ``video_name`` is appended to
            directly, so it is expected to end with a path separator.
        video_name: Folder name below ``orignal_save_path``; defaults to the
            folder this function always used before the parameter existed.

    Returns:
        None.
    """
    target_dir = str(orignal_save_path) + video_name
    check_file_exist_in_dst_dir(dst_dir=target_dir, dst_file="UrlAndTitle.txt")
    record_path = target_dir + "/UrlAndTitle.txt"

    # No encoding is passed, so the platform default applies, as it does for
    # files already written by earlier runs.
    with open(file=record_path, mode="r") as f:
        known_lines = set(f.readlines())

    # Append mode always writes at the end of the file, so one handle is
    # enough for the whole batch and no seek is needed.
    with open(file=record_path, mode="a") as f:
        for title, link in zip(videoTitleList, videoLinksList):
            title_line = "Title   :   " + title + "\n"
            if title_line in known_lines:
                print("《" + str(title.strip()) + "》已经存在！")
                continue
            f.write(title_line)
            f.write("Link    :   " + link + "\n\n")
            known_lines.add(title_line)
            print("Title    :   %s" % title)
            print("Link    :   %s" % link)


def main(videoName, savePath):
    """Search the CCTV site for ``videoName`` and record each title/link pair.

    Args:
        videoName: Text to search for; also the name of the target folder.
        savePath: Parent path that ``videoName`` is appended to directly, so
            it is expected to end with a path separator.
    """
    # keywords and the extended savePath are only needed by the disabled
    # download step at the end of this function.
    keywords = ["会", "专辑", "旋律", "《精彩音乐汇》"]
    check_dir_exist(save_path=savePath, video_name=videoName)
    original_save_path = savePath
    savePath = savePath + videoName

    # CCTV site home page.
    mainUrl = "https://tv.cctv.com/index.shtml"
    search_html = goToMainUrl(dstUrl=mainUrl, videoName=videoName)
    for result_link in get_main_url(page_suorce=search_html):
        episode_html = get_signal_download_link(dstUrl=str(result_link))
        videoLinksList, videoTitleList = get_dst_link(page_source=episode_html, print_flag=False)
        # Normalise the scraped titles before they are written out.
        videoTitleList = formatvideoTitle(videoTitles=videoTitleList)
        write_files(videoTitleList=videoTitleList, videoLinksList=videoLinksList, orignal_save_path=original_save_path)
    # Disabled download step:
    # downloadAll(savePath=savePath, videoLinks=videoLinksList, videoTitle=videoTitleList, keywords=keywords)


if __name__ == '__main__':
    # Initial variables: the text to search for and the parent path under
    # which the folder of the same name is created.
    videoName = "精彩音乐汇"
    savePath = r"F:/"

    # Disabled interactive alternative: prompt for the video name and for a
    # drive letter, then append "/" to the drive letter.
    # videoName = str(input("请输入你想下载的视频名字:"))
    # savePath = str(input("请输入你要保存的盘符名字（例如D:或者F:等）"))
    # savePath+="/"

    # print(savePath)
    main(videoName=videoName, savePath=savePath)

3.用于在txt文件中通过输入快速找到相关的链接

#!/usr/bin/env python
# -*-coding:utf-8-*-
# date :2021/8/31 0:22
# author:Sabo
# from os import listdir
def get_dirs_from_path_and_name(file_path_and_name):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file_path_and_name: Path of the text file to read.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(file=file_path_and_name, mode="r") as handle:
        return [raw_line.strip() for raw_line in handle.readlines()]
    # dirs = listdir(r"F:/乐享汇")
    # print(dirs)

def from_title_find_link_in_dirs(dirs, mp4_name):
    """Print the line that follows the title line of ``mp4_name``.

    The first element of ``dirs`` equal to ``"Title   :   " + mp4_name`` is
    located and the element after it is printed. A message is printed
    instead when the title is absent or when it is the last element and so
    has no following line.

    Args:
        dirs: List of stripped lines from the title/link text file.
        mp4_name: Title to look up.

    Returns:
        None.
    """
    title_line = "Title   :   " + mp4_name
    try:
        position = dirs.index(title_line)
    except ValueError:
        print("%s不在文件中!" % mp4_name)
        return
    if position + 1 < len(dirs):
        print("%s" % dirs[position + 1])
    else:
        # A title on the very last line has no link line after it.
        print("%s缺少对应的链接!" % mp4_name)


def main():
    """Prompt repeatedly for a title and print its link from the record file.

    The record file is re-read before every prompt. The loop ends cleanly
    when input reaches end-of-file or the user presses Ctrl+C.
    """
    dir_path = "F:/精彩音乐汇/UrlAndTitle.txt"
    while True:
        dirs_list = get_dirs_from_path_and_name(file_path_and_name=dir_path)
        try:
            mp4_name = input("MP4 name:")
        except (EOFError, KeyboardInterrupt):
            # Finish the prompt line and leave without a traceback.
            print()
            break
        from_title_find_link_in_dirs(dirs=dirs_list, mp4_name=mp4_name)


# Start the interactive lookup only when this file is executed as a script.
if __name__ == '__main__':
    main()