Since last year I have noticed many short videos online using the exact same script to stoke anxiety (the so-called 共享大脑, "shared brain", videos), so I wanted to analyze how these messages propagate and dig some background information out of them. Since I honestly don't understand deep learning and am doing this as an amateur, I could only cobble together the first half from bits and pieces; for the text-similarity analysis that comes afterwards I will have to find another way.
To keep 叔叔 (Bilibili) from noticing and breaking this download method, I will probably delete this article if it takes off.
Speech recognition uses faster-whisper large-v2; the model's download URL seems to require a proxy to reach.

Download those four files and put them in one folder; that folder is the model path.
Then install CUDA and cuDNN; the NVIDIA developer site explains how.
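Before running the full script, it may be worth sanity-checking the setup. A minimal sketch, assuming the four files are the usual faster-whisper export (model.bin, config.json, tokenizer.json, vocabulary.txt) and that ctranslate2 was installed alongside faster-whisper:

import os
import ctranslate2

model_path = input("Model folder path: ")
# File names as typically shipped for faster-whisper large-v2; adjust if yours differ.
expected = ["model.bin", "config.json", "tokenizer.json", "vocabulary.txt"]
missing = [f for f in expected if not os.path.exists(os.path.join(model_path, f))]
print("Missing model files:", missing or "none")
# 0 here means CTranslate2 cannot see a CUDA device, i.e. a CUDA/cuDNN problem.
print("CUDA devices visible to CTranslate2:", ctranslate2.get_cuda_device_count())
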
from urllib.parse import quote, unquote
from selenium import webdriver
import os
import re
import requests
import urllib.request  # used below via build_opener / urlretrieve
import time
import random
from faster_whisper import WhisperModel
from opencc import OpenCC

cc = OpenCC('t2s')  # convert Traditional Chinese transcripts to Simplified
# with open('keywords.txt', 'r', encoding='utf-8') as f:
#     keywords = f.read().split('\n')
# urlKeywords = quote(keywords, encoding='utf-8')
while True:
    # Interactive mode selection
    t = input("Select a mode:\n A. Test mode\n B. Batch mode \n C. Single-keyword mode \n D. Quit\n Enter choice -> ")
    if t == 'A':
        # Test mode: only crawl the first 8 result pages
        total_page = 8
        break
    elif t == 'B':
        # Batch mode: crawl all 34 result pages per keyword
        total_page = 34
        break
    elif t == 'C':
        # Single-keyword mode
        single_keyword = input("Enter a search term: ")
        total_page = 34
        break
    elif t == 'D':
        # Quit
        exit()
    else:
        print("Invalid input, please try again.\n")
try:
    # Create keywords.txt if it does not exist yet
    if not os.path.exists('keywords.txt'):
        with open('keywords.txt', 'w', encoding='utf-8') as f:
            f.write('默认搜索词')
        print("keywords.txt did not exist and has been created.\n"
              "keywords.txt stores the search terms, one per line.\n"
              "It must stay in the current directory; do not move or rename it.")
    # Fake user-agent pool for request rotation
    agent_list = [
        "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10"
    ]
model_size = "large-v2"
#本来是用的相对写法,但是这样打包的时候老是出问题,所以让用户自己输入模型路径,打包这个东西我不是很了解,老是出错
path = input("请输入模型路径:")
# Run on GPU with FP16
model = WhisperModel(model_size_or_path=path, device="cuda", local_files_only=True)
# or run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")
    def get_video_ids(urlKeywords):
        keywords = unquote(urlKeywords, encoding='utf-8')
        print(f"Fetching video IDs for {keywords}...")
        driver = webdriver.Chrome()  # swap in webdriver.Firefox() if you use Firefox
        # Walk result pages 1..total_page
        for i in range(1, total_page + 1):
            try:
                if i == 1:
                    driver.get(f"https://search.bilibili.com/all?keyword={urlKeywords}&search_source=1")
                else:
                    driver.get(
                        f"https://search.bilibili.com/all?keyword={urlKeywords}&search_source=1&page={i}&o={30 * i - 30}")
                # Grab the rendered HTML of the page
                html_content = driver.page_source
            except Exception:
                continue
            # Extract all Bilibili video IDs with a regex
            video_ids = re.findall(r"//www\.bilibili\.com/video/(BV[a-zA-Z0-9]+)", html_content)
            # Append the IDs to a temp file
            with open("raw_video_ids.temp", "a") as file:
                for video_id in video_ids:
                    file.write(video_id + "\n")
            print(f"keyword: {keywords}, page {i} video IDs saved to raw_video_ids.temp.")
        # Close the browser once all pages for this keyword are done
        driver.quit()
    # Collect video IDs: the single keyword in mode C, otherwise every line of keywords.txt
    if t == 'C':
        urlKeywords = quote(single_keyword, encoding='utf-8')
        get_video_ids(urlKeywords)
    else:
        with open('keywords.txt', 'r', encoding='utf-8') as f:
            keywords = f.read().split('\n')
        for keyword in keywords:
            urlKeywords = quote(keyword, encoding='utf-8')
            get_video_ids(urlKeywords)
    # Remove duplicate lines from a file
    def remove_duplicates(file_path, output_path):
        lines_seen = set()  # lines already written
        with open(file_path, "r", encoding="utf-8") as file:
            with open(output_path, "w", encoding="utf-8") as output:
                for line in file:
                    if line not in lines_seen:  # only write lines not seen before
                        output.write(line)
                        lines_seen.add(line)

    remove_duplicates("raw_video_ids.temp", "video_ids.temp")
    print("Deduplicated video IDs saved to video_ids.temp.")
    headers = {
        'User-Agent': random.choice(agent_list),
    }
    with open('video_ids.temp', 'r') as f:
        BVList = f.read().split('\n')
    def getCidAndTitle(bvid, p=1):
        url = 'https://api.bilibili.com/x/web-interface/view?bvid=' + bvid
        resp = requests.get(url, headers=headers)
        print(resp.status_code)
        data = resp.json()['data']
        title = data['title']
        cid = data['pages'][p - 1]['cid']  # cid of the requested page (part)
        return str(cid), title
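    # Usage (hypothetical id): getCidAndTitle('BVxxxxxxxxxx', p=2) pulls
    # data['title'] and data['pages'][1]['cid'] from the web-interface/view
    # response, i.e. the title plus the cid of part 2.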
    # Gather (bvid, cid, title) for each video
    def getInformation(bvList):
        infoList = []
        for bvid in bvList:
            item = []
            if len(bvid) == 12:  # a bare BV id ("BV" plus 10 characters)
                cid, title = getCidAndTitle(bvid)
                item.append(bvid)
            else:  # id with a one-character separator and a part number appended
                try:
                    cid, title = getCidAndTitle(bvid[:12], int(bvid[13:]))
                    item.append(bvid[:12])
                except Exception:
                    continue
            item.append(cid)
            item.append(title)
            infoList.append(item)
        return infoList
    # Fetch only the audio stream; downloading the full video is unnecessary
    def getAudio(infoList):
        baseUrl = 'http://api.bilibili.com/x/player/playurl?fnval=16&'
        for item in infoList:
            st = time.time()
            bvid, cid, title = item[0], item[1], item[2]
            url = baseUrl + 'bvid=' + bvid + '&cid=' + cid
            audioUrl = requests.get(url, headers=headers).json()['data']['dash']['audio'][0]['baseUrl']
            opener = urllib.request.build_opener()
            opener.addheaders = [
                ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0) Gecko/20100101 Firefox/56.0'),
                ('Accept', '*/*'),
                ('Accept-Language', 'en-US,en;q=0.5'),
                ('Accept-Encoding', 'gzip, deflate, br'),
                ('Range', 'bytes=0-'),
                ('Referer', 'https://api.bilibili.com/x/web-interface/view?bvid=' + bvid),  # the Referer header is mandatory here
                ('Origin', 'https://www.bilibili.com'),
                ('Connection', 'keep-alive'),
            ]
            urllib.request.install_opener(opener)
            filename = re.sub(r'[\/:*?"<>|]', '', title + '.mp3')  # strip characters not allowed in file names
            urllib.request.urlretrieve(url=audioUrl, filename=filename)
            ed = time.time()
            print(str(round(ed - st, 2)) + ' seconds, download finished:', title)
            time.sleep(1)
            segments, info = model.transcribe(filename, beam_size=5, language="zh", vad_filter=True,
                                              vad_parameters=dict(min_silence_duration_ms=1000))
            print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
            with open(f'result{filename}.txt', 'w', encoding='utf-8') as fl:
                fl.write('')  # truncate any previous result file for this video
            for segment in segments:
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
                with open(f'result{filename}.txt', 'a', encoding='utf-8') as f:
                    f.write(cc.convert(segment.text) + '\n')
            # Delete the audio once transcription is done. If the program is
            # killed mid-run, the temp audio file is left behind; checking for
            # leftovers at startup and removing them would be a worthwhile
            # improvement (see the sketch after this function).
            os.remove(filename)
            print('Temporary audio file deleted:', filename)
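    # A minimal sketch of the startup cleanup suggested above (assumes every
    # .mp3 in the working directory is one of this script's leftover temp
    # files; the helper name is hypothetical). Call it once before the main
    # loop if interrupted runs should be cleaned up automatically.
    def clean_leftover_audio():
        for name in os.listdir('.'):
            if name.endswith('.mp3'):
                os.remove(name)
                print('Removed leftover temp audio:', name)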
    '''
    Scraping all 34 pages of video info in one go would get the IP banned, so
    each round grabs a random batch of two to seven videos, fetches their info
    (cid and so on), downloads the audio, runs speech recognition, and only
    then starts the next round.
    '''
    if __name__ == '__main__':
        print('Downloader Start!')
        x = 0  # current position in BVList, kept across batches
        while x < len(BVList):
            # take a random batch of 2-7 IDs, then advance past it
            batch_size = random.randint(2, 7)
            lst = BVList[x:x + batch_size]
            x += batch_size
            st = time.time()
            getAudio(getInformation(lst))
            ed = time.time()
            print(f'Batch finished in {round(ed - st, 2)} seconds.')
except Exception as e:
    print('An error occurred:')
    print(e)
    input("Press any key to exit...")
