爬虫-bilibili视频下载

获取视频链接

import requests
from requests import RequestException
import bs4
from bs4 import BeautifulSoup
import os
import re

def get_html(url):
    """Fetch *url* with a browser User-Agent and return the HTML text.

    Returns None when the status code is not 200 or the request fails
    (the failure is reported on stdout).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
    except RequestException:
        print('请求html错误:')
        return None
    return response.text if response.status_code == 200 else None

# Fetch the overall ranking page and list every video link of the
# form www.<site>.com/video/BV... found in the raw HTML.
url = "https://www.bilibili.com/v/popular/rank/all"
videos = re.findall(r"[w]{3}\.\w+\.com/video/BV\w+", get_html(url))
print(videos)

获取视频链接及详细信息

import requests
from bs4 import BeautifulSoup
import bs4
from pyquery import PyQuery as pq
from requests import RequestException
import re

def get_html(url):
    """Download *url* and return its body text; None on error or non-200."""
    ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    try:
        resp = requests.get(url, headers={'User-Agent': ua})
        if resp.status_code == 200:
            return resp.text
    except RequestException:
        print('请求html错误:')


def get_datas(html):
    """Parse the ranking-page HTML and extract one record per rank item.

    Each <li class="rank-item"> contributes seven fields:
    rank, title, score, play count, view (danmaku) count, favourites, link.
    The fields of all items are concatenated into ONE flat list (the
    original extend() behaviour), which is what callers receive.
    """
    soup = bs4.BeautifulSoup(html, 'lxml')
    data = []
    for anime in soup.find_all("li", class_="rank-item"):
        # Look each container up once instead of repeating the search.
        info = anime.find('div', 'info')
        title = info.a.string
        link = info.a['href']
        rank = anime.find('div', 'num').string
        score = anime.find('div', 'pts').text
        # The three counters live in sibling <span class="data-box"> tags;
        # one find_all call replaces the original's three identical scans.
        boxes = anime.find_all('span', class_='data-box')
        play = boxes[0].text
        view = boxes[1].text
        fav = boxes[2].text
        data.extend([rank, title, score, play, view, fav, link])
    return data

def main():
    """Fetch the ranking page and parse it into the flat data list."""
    rank_url = "https://www.bilibili.com/v/popular/rank/all"
    page = get_html(rank_url)
    datas = get_datas(page)


if __name__ == "__main__":
    main()

下载视频

"""
https://blog.csdn.net/qq_45695453/article/details/105757919
https://github.com/inspurer/PythonSpider
"""
import requests
import re
import json
from contextlib import closing
from pyquery import PyQuery as pq
from requests import RequestException
class bilibili():
    """Download a bilibili video: fetch the page, extract the title and the
    first DASH stream URL from the embedded playinfo JSON, save to disk."""

    def __init__(self):
        # Headers for fetching the HTML page itself.
        self.getHtmlHeaders = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q = 0.9'
        }

        # Origin/Referer are required to pass bilibili's anti-hotlinking check
        # on the CDN stream URL.
        self.downloadVideoHeaders = {
            'Origin': 'https://www.bilibili.com',
            'Referer': 'https://www.bilibili.com/video/av26522634',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        }

    # The fetched source generally differs from what F12 shows, because the
    # devtools view reflects the browser-interpreted DOM.
    def getHtml(self, url):
        """Return the page HTML, or None on a non-200 status or request error."""
        try:
            response = requests.get(url=url, headers=self.getHtmlHeaders)
            print(response.status_code)
            if response.status_code == 200:
                return response.text
        except RequestException:
            print('请求Html错误:')

    def parseHtml(self, html):
        """Extract the video title (via pyquery) and the first DASH stream URL
        (via regex + json) from the page source.

        Raises ValueError when the embedded playinfo script is absent, instead
        of the opaque IndexError the bare [0] used to produce.
        """
        doc = pq(html)
        video_title = doc('#viewbox_report > h1 > span').text()

        # Playback info is embedded as JSON inside a <script> tag.
        pattern = r'\<script\>window\.__playinfo__=(.*?)\</script\>'
        matches = re.findall(pattern, html)
        if not matches:
            raise ValueError('window.__playinfo__ script not found in page')
        playinfo = json.loads(matches[0])
        video_url = playinfo['data']['dash']['video'][0]['base_url']
        return {
            'title': video_title,
            'url': video_url
        }

    def download_video(self, video):
        """Stream the video to '<title>.flv', writing in chunks so the whole
        file is never buffered in memory (stream=True + .content defeated
        streaming in the original)."""
        title = re.sub(r'[\/:*?"<>|]', '-', video['title'])  # strip filename-illegal chars
        filename = title + '.flv'
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept for parity with the original, but should be removed if possible.
        with closing(requests.get(video['url'], headers=self.downloadVideoHeaders,
                                  stream=True, verify=False)) as res:
            if res.status_code == 200:
                with open(filename, "wb") as f:
                    for chunk in res.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)

    def run(self, url):
        """Fetch, parse, and download the video at *url*."""
        self.download_video(self.parseHtml(self.getHtml(url)))

if __name__ == '__main__':
    # Entry point: download one sample video.
    bilibili().run('https://www.bilibili.com/video/av18100312')

下载视频(简化版)

import requests
import os
from bs4 import BeautifulSoup
import json
import re
import warnings

# Headers sent when fetching the CDN stream URL; Origin/Referer are needed
# to pass bilibili's anti-hotlinking (referer) check.
downloadVideoHeaders={
    'Origin': 'https://www.bilibili.com',
    'Referer': 'https://www.bilibili.com/video/av26522634',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}

def download_video(url):
    """Download the bilibili video behind *url* to '<basename(url)>.flv'.

    The stream URL is the value of the first "baseUrl" key embedded in the
    page's playinfo JSON; it is sliced out of the flattened page text and
    fetched with the anti-hotlinking headers.
    """
    r = requests.get(url)
    if r.status_code != 200:
        print("error")
        exit(-1)
    text = BeautifulSoup(r.text, 'html.parser').get_text()
    # The URL sits between the first '"baseUrl"' and the first '"base_url"'.
    start = text.find('baseUrl')
    end = text.find('base_url')
    if start == -1 or end == -1:
        # Guard: the original sliced with -1 offsets, producing garbage input.
        print("error")
        exit(-1)
    fragment = text[start - 1:end - 2]

    # json.loads instead of eval(): never evaluate scraped content as code.
    video = json.loads(fragment.replace('"baseUrl":', ''))
    filename = '%s.flv' % os.path.basename(url)
    # Stream to disk in chunks instead of buffering the whole file via
    # .content (which defeated stream=True in the original).
    # NOTE(review): verify=False disables TLS verification; kept from original.
    with open(filename, "wb") as f:
        res = requests.get(url=video, headers=downloadVideoHeaders, stream=True, verify=False)
        for chunk in res.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)

# Download one sample video.
download_video('https://www.bilibili.com/video/av18100312')

下载简化版(详细信息)

import requests
import os
from bs4 import BeautifulSoup
import json
import re
import warnings

# Headers sent when fetching the CDN stream URL; Origin/Referer are needed
# to pass bilibili's anti-hotlinking (referer) check.
downloadVideoHeaders={
    'Origin': 'https://www.bilibili.com',
    'Referer': 'https://www.bilibili.com/video/av26522634',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}

def download_video(url):
    """Download the video at *url* and scrape its metadata.

    Collects title / cover image / upload date / view count / danmaku count
    from the page (currently extracted but not printed, kept from the
    original), then pulls the stream URL out of the embedded
    window.__playinfo__ JSON and saves it as '<title>.flv'.
    """
    r = requests.get(url)
    if r.status_code != 200:
        print("error")
        exit(-1)
    soup = BeautifulSoup(r.text, 'lxml')
    title = soup.find('title').text
    image = re.findall(r"http:.+\.jpg", str(soup.find('meta', itemprop="image")))[0]
    uploadDate = re.findall(r"\d+.+\d+", str(soup.find('meta', itemprop="uploadDate")))[0]
    view = soup.find('span', 'view').text  # play count
    dm = soup.find('span', 'dm').text      # danmaku (bullet comment) count

    # Find the playinfo script and slice out the first "baseUrl" value.
    # The original left `video` unbound (NameError) when no script matched.
    fragment = None
    for script in soup.find_all('script'):
        if "window.__playinfo__=" in script.text:
            body = script.text
            fragment = body[body.find('baseUrl') - 1:body.find('base_url') - 2]
            break
    if fragment is None:
        print("error")
        exit(-1)

    # json.loads instead of eval(): never evaluate scraped content as code.
    video = json.loads(fragment.replace('"baseUrl":', ''))
    # Sanitize the title before using it as a filename — the page <title>
    # routinely contains characters that are illegal in filenames.
    safe_title = re.sub(r'[\/:*?"<>|]', '-', title)
    filename = '%s.flv' % safe_title
    # NOTE(review): verify=False disables TLS verification; kept from original.
    with open(filename, "wb") as f:
        res = requests.get(url=video, headers=downloadVideoHeaders, stream=True, verify=False)
        for chunk in res.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)

# Download one sample video.
download_video('https://www.bilibili.com/video/av18100312')

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
在B站下载指定视频的过程中,可以使用爬虫技术来实现。首先,你需要准备好爬虫工具和相关的编程环境。接下来,你可以使用Python编写爬虫代码来实现下载功能。 一种常见的方法是通过解析B站视频页面的源代码来获取视频的URL链接。你可以使用正则表达式或者BeautifulSoup等库来提取视频的URL链接。在提取URL链接之前,你需要找到视频文件所在的位置,一般是在video标签中。 在提取到视频的URL链接之后,你可以使用Python的下载库来下载视频文件。常见的文件格式可能是MP4或者其他一些常见的视频格式。你可以使用requests库来发送GET请求并下载视频文件。 但需要注意的是,B站对于视频下载有一些防护措施,比如防盗链机制。这意味着你在下载视频时可能需要设置Referer头信息,以绕过防盗链。你可以使用requests库来添加Referer头信息并发送请求。 总结起来,通过编写爬虫代码,你可以解析B站视频页面源代码,提取视频的URL链接,并使用下载库下载视频文件,同时需要注意处理防盗链机制。这样就能够实现下载B站指定视频的功能。[1][2][3] 引用: [1] [2] [3] [送书 | 教你下载B站指定视频](https://blog.csdn.net/tongtongjing1765/article/details/120558933)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值