Python: Automatically Grabbing the Transcript of Every Video on a Certain Anime Site's Search Page (Crawler + Speech Recognition)

Since last year I've noticed a lot of short videos online recycling the same scripts to stoke anxiety (so-called "shared brain" content), so I wanted to trace how these messages spread and dig out some background information. Since I honestly don't understand deep learning and am only doing this in my spare time, I could only cobble together the first half; for the later text-similarity analysis I'll have to find another way.

To keep the download method from getting patched once "Uncle" (Bilibili) notices, I'll probably delete this article if it blows up.

Speech recognition uses faster-whisper with the large-v2 model; accessing the model's download URL seems to require a proxy.
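If your network can reach the model host directly, you can skip the manual download and let faster-whisper fetch the model by name. A minimal sketch, assuming the default cache location is fine:

# Alternative to the manual download below: pass the model name and let
# faster-whisper download it on first use (needs direct access to the host).
from faster_whisper import WhisperModel
model = WhisperModel("large-v2", device="cuda", compute_type="float16")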

(Screenshot: file listing on the model download page)

Download the last four files shown there and put them in a single folder; that folder is the model path.
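If I remember right, the four files are config.json, model.bin, tokenizer.json, and vocabulary.txt. A quick sketch to verify the folder is complete before loading (the expected file list is an assumption based on the model repository's layout; adjust if yours differs):

import os

expected = ["config.json", "model.bin", "tokenizer.json", "vocabulary.txt"]  # assumed layout
model_dir = r"path/to/faster-whisper-large-v2"  # hypothetical path, use your own
missing = [f for f in expected if not os.path.exists(os.path.join(model_dir, f))]
print("model folder OK" if not missing else f"missing files: {missing}")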

You also need CUDA and cuDNN installed; the NVIDIA developer site explains how.
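After installing, a quick way to confirm the GPU stack is visible to CTranslate2 (the inference engine faster-whisper runs on) is a sketch like this:

# If this prints 0, CUDA/cuDNN are not set up correctly for CTranslate2.
import ctranslate2
print("CUDA devices visible:", ctranslate2.get_cuda_device_count())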

from urllib.parse import quote, unquote
from selenium import webdriver
import os
import re
import requests
import urllib.request
import time
import random
from faster_whisper import WhisperModel
from opencc import OpenCC

cc = OpenCC('t2s')  # Traditional-to-Simplified Chinese converter for the transcripts
# with open('keywords.txt', 'r', encoding='utf-8') as f:
#     keywords = f.read().split('\n')
#     urlKeywords  = quote(keywords, encoding='utf-8')

while True:

    # User interaction: choose a run mode
    t = input("Select a mode:\n A. Test mode\n B. Batch mode \n C. Single-keyword mode \n D. Quit\n Enter your choice -> ")

    if t == 'A':
        # Test mode: only scrape the first 8 result pages
        total_page = 8
        break
    elif t == 'B':
        # Batch mode: scrape all 34 result pages for every keyword in keywords.txt
        total_page = 34
        break
    elif t == 'C':
        # Single-keyword mode
        single_keyword = input("Enter a search keyword: ")
        total_page = 34
        break
    elif t == 'D':
        # Quit
        exit()
    else:
        print("Invalid input, please try again.\n")

try:
    # Create keywords.txt if it does not exist
    if not os.path.exists('keywords.txt'):
        with open('keywords.txt', 'w', encoding='utf-8') as f:
            f.write('default keyword')
        print(
            "keywords.txt did not exist and has been created.\n"
            "keywords.txt stores the search keywords, one per line.\n"
            "It must stay in the current directory; do not move or rename it.")
    # Pool of fake User-Agent strings to rotate through
    agent_list = [
        "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10"
    ]

    model_size = "large-v2"

    # Originally this used a relative path, but that kept breaking when packaging
    # the script, so the user enters the model path instead. I don't know much
    # about packaging; it kept erroring out.
    path = input("Enter the model path: ")

    # Run on GPU with FP16
    model = WhisperModel(model_size_or_path=path, device="cuda", compute_type="float16", local_files_only=True)


    # or run on GPU with INT8
    # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
    # or run on CPU with INT8
    # model = WhisperModel(model_size, device="cpu", compute_type="int8")

    def get_video_ids(urlKeywords):
        keywords = unquote(urlKeywords, encoding='utf-8')
        print(f"Fetching video IDs for keyword: {keywords} ...")

        driver = webdriver.Chrome()  # replace with webdriver.Firefox() if you use Firefox

        # Walk result pages 1..total_page
        for i in range(1, total_page + 1):
            try:
                if i == 1:
                    # Open the first search result page
                    driver.get(f"https://search.bilibili.com/all?keyword={urlKeywords}&search_source=1")
                else:
                    # Open subsequent result pages
                    driver.get(
                        f"https://search.bilibili.com/all?keyword={urlKeywords}&search_source=1&page={i}&o={30 * i - 30}")
                # Grab the rendered page HTML
                html_content = driver.page_source
            except Exception:
                continue

            # Find all Bilibili video IDs (BV ids) with a regex
            video_ids = re.findall(r"//www\.bilibili\.com/video/(BV[a-zA-Z0-9]+)", html_content)

            # Append the found IDs to a temp file
            with open("raw_video_ids.temp", "a") as file:
                for video_id in video_ids:
                    file.write(video_id + "\n")
            print(f"keyword: {keywords}, page {i} video IDs saved to raw_video_ids.temp.")

        # Close the browser once all pages for this keyword are done
        driver.quit()


    with open('keywords.txt', 'r', encoding='utf-8') as f:
        if t == 'C':
            urlKeywords = quote(single_keyword, encoding='utf-8')
            get_video_ids(urlKeywords)
        else:
            keywords = f.read().split('\n')
            for keyword in keywords:
                urlKeywords = quote(keyword, encoding='utf-8')
                get_video_ids(urlKeywords)
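    # For reference, quote() just percent-encodes the keyword's UTF-8 bytes,
    # e.g. quote('考研', encoding='utf-8') -> '%E8%80%83%E7%A0%94' (hypothetical
    # keyword), which is the form the search URL's keyword parameter expects.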


    # Strip duplicate lines from a file
    def remove_duplicates(file_path, output_path):
        lines_seen = set()  # lines already written to the output
        with open(file_path, "r", encoding="utf-8") as file:
            with open(output_path, "w", encoding="utf-8") as output:
                for line in file:
                    if line not in lines_seen:  # only write lines we have not seen yet
                        output.write(line)
                        lines_seen.add(line)


    remove_duplicates("raw_video_ids.temp", "video_ids.temp")
    print("Deduplicated video IDs saved to video_ids.temp.")

    headers = {
        'User-Agent': random.choice(agent_list),
    }
    with open('video_ids.temp', 'r') as f:
        BVList = [line for line in f.read().split('\n') if line]  # drop empty lines


    def getCidAndTitle(bvid, p=1):
        url = 'https://api.bilibili.com/x/web-interface/view?bvid=' + bvid
        response = requests.get(url, headers=headers)
        print(response.status_code)
        data = response.json()['data']
        title = data['title']
        cid = data['pages'][p - 1]['cid']
        return str(cid), title
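    # Example shape (hypothetical values): getCidAndTitle('BVxxxxxxxxxx')
    # returns ('123456789', 'video title'), where the cid identifies the part
    # and comes from data['pages'] in the view API's JSON.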

    # Collect each video's metadata (BV id, cid, title)

    def getInformation(bvList):
        infoList = []
        for bvid in bvList:
            item = []
            if len(bvid) == 12:
                # Plain BV id ("BV" plus 10 characters)
                cid, title = getCidAndTitle(bvid)
                item.append(bvid)
            else:
                try:
                    # BV id followed by a separator and a page number
                    cid, title = getCidAndTitle(bvid[:12], int(bvid[13:]))
                    item.append(bvid[:12])
                except Exception:
                    continue
            item.append(cid)
            item.append(title)
            infoList.append(item)

        return infoList
    # Next, grab only the audio; downloading the full video is unnecessary

    def getAudio(infoList):
        baseUrl = 'http://api.bilibili.com/x/player/playurl?fnval=16&'

        for item in infoList:
            st = time.time()
            bvid, cid, title = item[0], item[1], item[2]
            url = baseUrl + 'bvid=' + bvid + '&cid=' + cid

            # The playurl API returns a DASH manifest; take the first audio stream
            audioUrl = requests.get(url, headers=headers).json()['data']['dash']['audio'][0]['baseUrl']

            opener = urllib.request.build_opener()
            opener.addheaders = [
                ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0) Gecko/20100101 Firefox/56.0'),
                ('Accept', '*/*'),
                ('Accept-Language', 'en-US,en;q=0.5'),
                ('Accept-Encoding', 'gzip, deflate, br'),
                ('Range', 'bytes=0-'),
                ('Referer', 'https://api.bilibili.com/x/web-interface/view?bvid=' + bvid),  # note: the Referer header is mandatory here!
                ('Origin', 'https://www.bilibili.com'),
                ('Connection', 'keep-alive'),
            ]
            urllib.request.install_opener(opener)
            filename = re.sub(r'[\/:*?"<>|]', '', title + '.mp3')  # strip characters forbidden in filenames
            urllib.request.urlretrieve(url=audioUrl, filename=filename)
            ed = time.time()
            print(str(round(ed - st, 2)) + ' seconds download finish:', title)
            time.sleep(1)
            segments, info = model.transcribe(filename, beam_size=5, language="zh", vad_filter=True,
                                              vad_parameters=dict(min_silence_duration_ms=1000))
            print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
            # Truncate (or create) the result file, then append one line per segment
            with open(f'result{filename}.txt', 'w', encoding='utf-8') as fl:
                fl.write('')
            for segment in segments:
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
                with open(f'result{filename}.txt', 'a', encoding='utf-8') as f:
                    f.write(cc.convert(segment.text) + '\n')
            # Delete the audio once it has been transcribed. If the program is
            # killed mid-run, leftover temp audio files will remain; if anyone
            # cares to improve this, the script could scan for and delete such
            # leftovers at every startup.
            os.remove(filename)
            print('Temporary audio file deleted:', filename)
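
    # A possible version of the startup cleanup suggested above (a sketch, not
    # wired into the rest of the script): call it once before the main loop to
    # remove any .mp3 files left behind by an interrupted run.
    def clean_leftover_audio(directory='.'):
        for name in os.listdir(directory):
            if name.endswith('.mp3'):
                os.remove(os.path.join(directory, name))
                print('Removed leftover temp audio file:', name)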

    # You can't scrape all 34 pages of video info in one go, or the IP gets
    # banned. So take a random handful of videos at a time, fetch their info
    # (cid and so on), download and transcribe their audio, then move on to
    # the next batch.
    if __name__ == '__main__':
        print('Downloader Start!')
        x = 0
        while x < len(BVList):
            batch = random.randint(2, 7)
            lst = BVList[x:x + batch]
            x += batch
            st = time.time()
            getAudio(getInformation(lst))
            ed = time.time()
            print(str(round(ed - st, 2)) + ' seconds for this batch.')

except Exception as e:
    print('An error occurred:')
    print(e)

input("按任意键退出...")