# NOTE: this is an informal (non-compliant) scraper, so each run feels a bit risky;
# the target sites are shady ones hosting sensitive content, so the URL is withheld here.
import requests
from bs4 import BeautifulSoup
import time
import socket
import os
import re
import bs4
def getText(url):
    """Download the binary content (a video) at *url*.

    Uses a streamed GET with a 20s timeout and a minimal User-Agent.
    Returns the raw response bytes, or None when the request fails.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=20, headers=header, stream=True)
        r.raise_for_status()
        return r.content
    except requests.RequestException:
        # Narrowed from a bare "except:" so Ctrl-C still interrupts the run.
        # Keep the original best-effort contract: report and return None.
        print("申请视频错误")
        return None
def writeFile(path, content):
    """Write *content* (bytes) to *path*, but only if the file is absent.

    Existing files are never overwritten, so re-running the scraper
    skips videos that were already downloaded.
    """
    if not os.path.exists(path):
        # The context manager flushes and closes on exit; the original's
        # explicit flush() and empty "else: pass" branch were redundant.
        with open(path, "wb") as file:
            file.write(content)
def dealUrl(url, dir):
    """Build the local save path for a video URL.

    "http://host/<id>/<name>.ext" maps to dir + "<id>_<name>.mp4".
    Creates *dir* if it does not exist. NOTE: *dir* is expected to end
    with a path separator (the caller passes e.g. "D:\\mp4_MV\\").
    """
    if not os.path.exists(dir):
        os.mkdir(dir)
    # Renamed locals: the original shadowed the builtin "str".
    segments = url.split("/")
    stem = segments[-1].split(".")[0]
    filename = segments[-2] + "_" + stem + ".mp4"
    return dir + filename
def getHtml(url):
    """Fetch *url* and return its decoded HTML text.

    On an HTTP error returns the numeric status code (the caller
    checks for 404); on a connection-level failure returns None.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    r = None
    try:
        r = requests.get(url, timeout=20, headers=header)
        r.encoding = r.apparent_encoding  # best-guess decoding for GBK/UTF-8 pages
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        print("申请网站错误")
        # Bug fix: the original referenced r unconditionally here and
        # crashed with NameError when requests.get() itself raised.
        return r.status_code if r is not None else None
def getMvUrl(html):
    """Collect the unique mv-page hrefs (paths containing "/all/") from a
    listing page.

    Anchors are kept only when they have no "rel" attribute and contain
    at least one child tag (e.g. a thumbnail), which filters out plain
    text navigation links.
    """
    soup = BeautifulSoup(html, "html.parser")
    found = []
    for anchor in soup("a", href=re.compile("/all/")):
        if not isinstance(anchor, bs4.element.Tag):
            continue
        if anchor.has_attr('rel'):
            continue
        # Calling a Tag searches its descendants; require at least one
        # real child tag inside the anchor.
        if not anchor(lambda t: t.name != None):
            continue
        href = anchor["href"]
        if href not in found:
            found.append(href)
    return found
def dealMvUrl(mvs, url):
    """Join each relative mv path onto the site root taken from *url*.

    *url* is expected to look like "http://host//page", so splitting on
    "//" yields [scheme, host, page]; each entry of *mvs* (e.g.
    "/all/x.html") is appended to "scheme//host".
    """
    parts = url.split("//")
    # Hoisted loop invariant; the original rebuilt the prefix (and
    # mutated the split list) on every iteration.
    base = parts[0] + "//" + parts[1]
    return [base + mv for mv in mvs]
def getVideoUrl(url):
    """Open an mv page and extract the actual video source URL.

    The page embeds the source inside <div id="vod"> as a "$"-separated
    string whose second-to-last field is the media URL.
    Returns None when the request or the extraction fails.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=20, headers=header)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
    except requests.RequestException:
        # Narrowed from a bare "except:"; only network/HTTP failures
        # produce this message now.
        print("申请视频页面错误")
        return None
    soup = BeautifulSoup(r.text, "html.parser")
    video = soup.find("div", {"id": "vod"})
    if isinstance(video, bs4.element.Tag) and video.string:
        # Bug fix: video.string can be None (mixed content), which the
        # original masked with its bare except; guard short splits too.
        fields = video.string.split("$")
        if len(fields) >= 2:
            return fields[-2]
    return None
def getPages(html):
    """Extract sub-page paths (hrefs containing "/all/") from the site menu.

    Looks inside the <div class="menu on"> navigation block.
    Returns an empty list when that block is missing.
    """
    soup = BeautifulSoup(html, "html.parser")
    menu = soup.find("div", {"class": "menu on"})
    if menu is None:
        # Bug fix: the original called menus(...) on None and crashed
        # with TypeError when the menu div was absent from the page.
        return []
    pages = []
    for a in menu("a", href=re.compile("/all/")):
        if isinstance(a, bs4.element.Tag):
            pages.append(a["href"])
    return pages
def dealPageUrl(pages, url):  # normalize sub-page requests; scrape pages 2-12
    """Expand each sub-page path into its listing pages 2 through 12.

    For url "http://www.x.com//index.html" and page "/all/a.html" this
    yields "http://www.x.com/all/a2.html" ... "http://www.x.com/all/a12.html".
    Assumes the joined URL splits on "." into exactly four parts
    (scheme+www / domain / path stem / extension).
    """
    # Renamed locals: the original shadowed the builtins "abs" and "str".
    results = []
    url_parts = url.split("/")
    root = url_parts[0] + "//" + url_parts[2]  # "http:" + "//" + host
    for page in pages:
        dot_parts = (root + page).split(".")
        for page_no in range(2, 13):
            results.append("{0}.{1}.{2}{3}.{4}".format(
                dot_parts[0], dot_parts[1], dot_parts[2], page_no, dot_parts[3]))
    return results
if __name__ == '__main__':
    dir_url = "D:\\mp4_MV\\"  # local folder where the videos are saved
    url = "http://www.****.com//index.html"
    mvUrls = []   # mv relative paths of the current sub-page
    mv_num = 0    # running total of video links discovered
    now_op = 1    # 1-based counter of download attempts

    html_main = getHtml(url)                    # fetch the main page structure
    pageUrls = getPages(html_main)              # sub-page paths from the menu
    abs_page_url = dealPageUrl(pageUrls, url)   # absolute URLs for pages 2-12

    for page in abs_page_url:
        html = getHtml(page)
        # getHtml returns the status code on HTTP errors (404 check kept
        # from the original) and may return None on connection failures.
        if html == 404 or html is None:
            print(html)
            continue
        mvUrls = getMvUrl(html)                 # mv relative paths on this page
        True_mv_path = dealMvUrl(mvUrls, url)   # absolute mv-page URLs
        mv_num += len(True_mv_path)
        for mvitem in True_mv_path:
            # Bug fix: bind path before the try so the error message below
            # cannot itself raise NameError on a first-iteration failure.
            path = ""
            try:
                print("开始第-" + str(now_op) + "-个")
                time.sleep(1)  # throttle: be gentle with the server
                src = getVideoUrl(mvitem)       # real video source URL
                path = dealUrl(src, dir_url)    # local save path (creates dir)
                content = getText(src)          # download the bytes
                writeFile(path, content)        # persist (skips existing files)
                print("第" + str(now_op) + "-个-this_ok--" + path)
                now_op += 1
            except Exception:
                # Narrowed from a bare "except:" so Ctrl-C can still stop the run.
                print("第-" + str(now_op) + "-个-this--" + path + "-出现错误")
                now_op += 1
                continue
    print("all_ok")
    print("{:-^20}".format("共有_" + str(mv_num) + "_个视频"))