爱奇艺弹幕爬取并提取部分转换为ass字幕

一、从爱奇艺视频页面获取tvid

url = 'https://www.iqiyi.com/v_260uudpmizo.html'
def get_tvid(url):
    """Return the tvid found in an iQiyi play page.

    Downloads the page with ``get_page`` and returns the first run of digits
    that follows the text ``tvid`` on the same line. According to the original
    author's note, movie ids have 9-10 digits and TV-episode ids 16 digits.

    Args:
        url: Address of the iQiyi play page.

    Returns:
        The tvid as a string of digits.

    Raises:
        IndexError: If the page contains no ``tvid`` followed by digits
            (same exception type the previous ``findall(...)[0]`` raised).
    """
    html = get_page(url)
    # Raw string: "\d" inside a normal string literal is an invalid escape.
    match = re.search(r"tvid.*?(\d+)", html)
    if match is None:
        raise IndexError("no tvid found in page: %s" % url)
    return match.group(1)

获取aid

def get_aid(url):
    """Return the aid (album id) found in an iQiyi play page.

    Downloads the page with ``get_page`` and returns the first run of digits
    that follows the text ``aid`` on the same line.

    NOTE(review): the pattern matches the letters "aid" anywhere, including
    inside longer words, so the first hit is not guaranteed to be the album
    id -- verify against a real page.

    Args:
        url: Address of the iQiyi play page.

    Returns:
        The aid as a string of digits.

    Raises:
        IndexError: If the page contains no ``aid`` followed by digits
            (same exception type the previous ``findall(...)[0]`` raised).
    """
    html = get_page(url)
    # Raw string: "\d" inside a normal string literal is an invalid escape.
    match = re.search(r"aid.*?(\d+)", html)
    if match is None:
        raise IndexError("no aid found in page: %s" % url)
    return match.group(1)

电影:根据tvid可以直接获取弹幕链接。

电视剧:先根据aid获取每一集的tvid,然后获取弹幕链接并下载。

在豆瓣页面,可以通过豆瓣api获取爱奇艺的链接;api返回的信息中包含手机端链接,从手机端链接中可以直接提取aid和tvid。

下载xml弹幕后转换为ass

下一步:添加原字幕至ass中

1.原字幕为ass

直接提取style 将字幕行追加至结尾

2.原字幕为srt:读取字幕并转换

#coding=utf-8
#根据爱奇艺播放页面 下载所有弹幕xml
import requests
import re
import json
import zlib
import xml.dom.minidom
import random
import os
import chardet

def get_page(url, timeout=10):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body as decoded by ``requests`` (``response.text``).
    """
    # Present a desktop-browser User-Agent with the request.
    headers = {
        'USER-AGENT':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.text
def get_str_btw(s, f, b):
    """Return the part of ``s`` between the first ``f`` and the next ``b``.

    If ``f`` does not occur the result is an empty string; if ``b`` does not
    occur after ``f`` everything following ``f`` is returned.
    """
    _, _, after_front = s.partition(f)
    between, _, _ = after_front.partition(b)
    return between


def filename_find(filepath, return_type=0):
    """Return one component of the file name at ``filepath``.

    Args:
        filepath: Path of the file.
        return_type: Which component to return:
            0 -- the base name, e.g. ``movie.zh.ass``;
            1 -- the extension with its dot, e.g. ``.ass``;
            2 -- the base name without the extension, e.g. ``movie.zh``;
            3 -- the last two dot-separated parts, e.g. ``.zh.ass``
                 (subtitle language plus extension).

    Returns:
        The requested string, or ``None`` for an unknown ``return_type``.
        For a name that contains no dot, the extension variants (1 and 3)
        are empty strings and variant 2 is the whole base name.
    """
    basename = os.path.basename(filepath)
    if return_type == 0:
        return basename

    parts = basename.split(".")
    if len(parts) < 2:
        # No dot in the name: there is no extension to strip or report.
        # (Indexing parts[-2] here used to raise IndexError.)
        extension = ""
        extension_lang = ""
    else:
        extension = "." + parts[-1]
        extension_lang = "." + parts[-2] + "." + parts[-1]

    if return_type == 1:
        return extension
    if return_type == 2:
        return basename[:len(basename) - len(extension)]
    if return_type == 3:
        return extension_lang
    return None


def get_tvid(url):
    """Return the tvid found in an iQiyi play page.

    Downloads the page with ``get_page`` and returns the first run of digits
    that follows the text ``tvid`` on the same line. According to the original
    author's note, movie ids have 9-10 digits and TV-episode ids 16 digits.

    Args:
        url: Address of the iQiyi play page.

    Returns:
        The tvid as a string of digits.

    Raises:
        IndexError: If the page contains no ``tvid`` followed by digits
            (same exception type the previous ``findall(...)[0]`` raised).
    """
    html = get_page(url)
    # Raw string: "\d" inside a normal string literal is an invalid escape.
    match = re.search(r"tvid.*?(\d+)", html)
    if match is None:
        raise IndexError("no tvid found in page: %s" % url)
    return match.group(1)
# print(get_tvid(url))


def get_aid(url):
    """Return the aid (album id) found in an iQiyi play page.

    Downloads the page with ``get_page`` and returns the first run of digits
    that follows the text ``aid`` on the same line.

    NOTE(review): the pattern matches the letters "aid" anywhere, including
    inside longer words, so the first hit is not guaranteed to be the album
    id -- verify against a real page.

    Args:
        url: Address of the iQiyi play page.

    Returns:
        The aid as a string of digits.

    Raises:
        IndexError: If the page contains no ``aid`` followed by digits
            (same exception type the previous ``findall(...)[0]`` raised).
    """
    html = get_page(url)
    # Raw string: "\d" inside a normal string literal is an invalid escape.
    match = re.search(r"aid.*?(\d+)", html)
    if match is None:
        raise IndexError("no aid found in page: %s" % url)
    return match.group(1)
# print(get_aid(url))


def get_tvid_list(aid, timeout=10):
    """Return the tvid of every episode in the album identified by ``aid``.

    Queries the ``avlistinfo`` endpoint once (page 1, size 999), so at most
    999 episodes are returned.

    Args:
        aid: Album id, as a string or an integer.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the ``tvId`` value of each entry, in response order.

    Raises:
        KeyError: If the response lacks the expected ``data`` /
            ``epsodelist`` / ``tvId`` keys.
    """
    url = 'https://pcw-api.iqiyi.com/albums/album/avlistinfo?aid=' \
          + str(aid) + '&page=1' \
          + '&size=999'

    res_json = json.loads(requests.get(url, timeout=timeout).text)

    # NOTE(review): 'epsodelist' is spelled exactly as in the original code;
    # presumably it mirrors the key used by the API response -- confirm.
    return [episode['tvId'] for episode in res_json['data']['epsodelist']]

# print(get_tv_id('8219492629300201'))


def get_bullet(tv_id, file_path):
    """Download the danmaku pages of one video and save them as XML files.

    Pages 1..49 are requested in order from
    ``https://cmts.iqiyi.com/bullet/<tv_id[-4:-2]>/<tv_id[-2:]>/<tv_id>_300_<page>.z``.
    Each response is zlib-decompressed and written to
    ``<dir of file_path>/<file name without extension>/<tv_id>_300_<page>.xml``.
    Downloading stops at the first page that cannot be decompressed.

    Args:
        tv_id: The video's tvid, as a string or an integer.
        file_path: Path of the video file; decides where the XML files go.

    Returns:
        The list of XML file paths written, in page order (possibly empty).
    """
    tv_id = str(tv_id)  # ids taken from the JSON API may be integers
    xml_file_list = []
    file_name = filename_find(file_path, 2)
    folder_path = os.path.join(os.path.dirname(file_path), file_name)
    print(folder_path)
    os.makedirs(folder_path, exist_ok=True)

    for page in range(1, 50):
        url = 'https://cmts.iqiyi.com/bullet/' \
              + tv_id[-4:-2] + '/' \
              + tv_id[-2:] + '/' \
              + tv_id + '_300_' \
              + str(page) + '.z'
        print(url)

        # Fetch the compressed danmaku page.
        res = requests.get(url, timeout=30).content
        try:
            xml_text = zlib.decompress(res).decode('utf-8')
        except (zlib.error, UnicodeDecodeError):
            # A body that is not valid compressed text marks the end of the
            # available pages.
            break

        file = folder_path + '/' + tv_id + '_300_' + str(page) + '.xml'
        with open(file, 'w', encoding='utf-8') as f:
            f.write(xml_text)
        xml_file_list.append(file)

    # Always return the list, including when every requested page existed
    # (the loop used to fall through and return None in that case).
    return xml_file_list


def seconds_to_time(seconds):
    """Format a number of seconds as an ``H:MM:SS.cc`` timestamp.

    The hundredths field is truncated, not rounded.
    """
    whole_minutes, sec_part = divmod(seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    hundredths = int(seconds * 100 % 100)
    fields = (hours, minutes, sec_part, hundredths)
    return "%d:%02d:%02d.%02d" % fields


# Header written once at the top of a freshly created .ass file. It ends right
# after the [Events] "Format:" line so the first Dialogue line starts at
# column 0 (the literal used to end with eight stray spaces).
_ASS_HEADER = '''[Script Info]
Title: bilibili ASS 弹幕在线转换
Original Script: 根据 251884753.xml 的弹幕信息,由 https://github.com/tiansh/us-danmaku 生成
ScriptType: v4.00+
Collisions: Normal
PlayResX: 560
PlayResY: 420
Timer: 10.0000

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Fix,Microsoft YaHei UI,20,&H66FFFFFF,&H66FFFFFF,&H66000000,&H66000000,1,0,0,0,100,100,0,0,1,2,0,2,20,20,2,0
Style: R2L,Microsoft YaHei UI,20,&H66FFFFFF,&H66FFFFFF,&H66000000,&H66000000,1,0,0,0,100,100,0,0,1,2,0,2,20,20,2,0

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'''

# Y coordinates used in turn for successive scrolling comments.
_DANMAKU_LANES = (20, 40, 60, 80)


def xml_to_ass(xml_path, file_path):
    """Convert one downloaded danmaku XML page into ASS dialogue lines.

    The output file is ``<dir of file_path>/<name without ext>.Danish.ass``.
    Page 1 starts that file from scratch and writes the ASS header first;
    every other page is appended. For each ``<list>`` element one of its
    ``<content>`` texts is picked at random and shown from the element's first
    ``<showTime>`` (plus a random fraction of a second) for about 8 seconds,
    scrolling right to left on one of four lanes. Elements missing the
    expected data are skipped. Afterwards the XML file is deleted and its
    directory is removed if that leaves it empty.

    Args:
        xml_path: Path of an XML page named ``<tvid>_300_<page>.xml``.
        file_path: Path of the video file the subtitles belong to.
    """
    path = os.path.dirname(xml_path)
    file_name = os.path.basename(xml_path)
    print(file_name)
    # The page number is whatever follows the LAST underscore; searching for
    # the first "300_" misfires when the tvid itself ends in "300".
    page = os.path.splitext(file_name)[0].rpartition('_')[2]
    print(page)
    # NOTE(review): with a bare file name dirname() is '' and this path starts
    # at the filesystem root; kept unchanged because aiqiyi_bullet() builds
    # the same string.
    ass_path = os.path.dirname(file_path) + "/" + filename_find(file_path, 2) + ".Danish.ass"
    first_page = page == '1'

    dom = xml.dom.minidom.parse(xml_path)
    entries = dom.documentElement.getElementsByTagName('list')

    # "w" on the first page discards any previous output; later pages append.
    with open(ass_path, 'w' if first_page else 'a', encoding='utf-8') as ass_file:
        if first_page:
            ass_file.write(_ASS_HEADER)
        lane = 0
        for item in entries:
            try:
                contents = item.getElementsByTagName('content')
                if not contents:
                    continue
                # Each entry holds several comments; show one of them at random.
                content_choose = random.choice(contents).firstChild.data
                seconds = float(item.getElementsByTagName('showTime')[0].firstChild.data)
            except (AttributeError, IndexError, ValueError):
                # Empty <content>, missing <showTime> or non-numeric time.
                continue
            y = _DANMAKU_LANES[lane]
            style = r'\move(%s,%s,%s,%s)' % (random.randint(560, 760), y, -random.randint(10, 100), y)
            ass_file.write('Dialogue: 0,%(start)s,%(end)s,%(styleid)s,,20,20,2,,{%(styles)s}%(text)s\n' % {
                # Random sub-second offsets spread comments sharing a second.
                'start': seconds_to_time(seconds + random.random()),
                'end': seconds_to_time(seconds + random.random() + 8),
                'styles': style,
                'text': content_choose,
                'styleid': 'Fix'})
            lane = (lane + 1) % len(_DANMAKU_LANES)

    os.remove(xml_path)
    try:
        os.rmdir(path)
    except OSError:
        # Directory still holds other pages (or is otherwise not removable).
        pass


def get_codetype(ass_name):
    """Detect the text encoding of a subtitle file.

    Args:
        ass_name: Path of the file to inspect.

    Returns:
        The ``encoding`` entry of ``chardet.detect`` for the file's bytes
        (may be ``None``).
    """
    # Read raw bytes; the context manager closes the file (the previous code
    # referenced ``f.close`` without calling it).
    with open(ass_name, 'rb') as f:
        data = f.read()
    return chardet.detect(data).get('encoding')


def read_ass_file(ass_name, codetype):
    """Return the lines of an ASS file that follow its ``[Events]`` header.

    Args:
        ass_name: Path of the ASS file.
        codetype: Encoding used to read the file; ``None`` means UTF-8.

    Returns:
        The list of lines (line endings kept) after the first line containing
        ``[Events]``, or an empty list if there is no such line.
    """
    if codetype is None:
        codetype = 'utf-8'
    # ``with`` closes the handle even when decoding fails part-way through.
    with open(ass_name, 'r', encoding=codetype) as f_ass:
        subtitle = f_ass.readlines()
    for index, line in enumerate(subtitle):
        if "[Events]" in line:
            return subtitle[index + 1:]
    return []


def read_ass_style(ass_name, codetype):
    """Return the ``Style:`` lines of an ASS file as one string.

    Only lines between the ``[V4+ Styles]`` header and the ``[Events]``
    header are considered.

    Args:
        ass_name: Path of the ASS file.
        codetype: Encoding used to read the file; ``None`` means UTF-8.

    Returns:
        The matching lines joined together (line endings kept). Empty if the
        file has no ``[V4+ Styles]`` header before ``[Events]``; if there is
        no ``[Events]`` header the search runs to the end of the file.
    """
    if codetype is None:
        codetype = 'utf-8'
    with open(ass_name, "r", encoding=codetype) as file:
        subtitle = file.readlines()

    # Defaults cover files lacking either header (previously these names
    # were left unbound and the function crashed).
    start = None
    end = len(subtitle)
    for i, line in enumerate(subtitle):
        if "[V4+ Styles]" in line:
            start = i
        if "[Events]" in line:
            end = i
            break
    if start is None:
        return ''
    return ''.join(line for line in subtitle[start:end] if "Style:" in line)


def addline_to_danmu(danmu_name, codetype, text):
    """Insert extra style text into the styles section of a danmaku ASS file.

    ``text`` plus a newline is inserted one line above the first line that
    contains ``[Events]`` (i.e. before the separator line that precedes the
    header), and the file is rewritten. If there is no ``[Events]`` line the
    content is written back unchanged.

    Args:
        danmu_name: Path of the ASS file to modify in place.
        codetype: Encoding used to read and write the file; ``None`` means
            UTF-8.
        text: Text to insert, e.g. the ``Style:`` lines of another file.
    """
    if codetype is None:
        codetype = 'utf-8'
    with open(danmu_name, "r", encoding=codetype) as file:
        subtitle = file.readlines()

    events_at = next(
        (i for i, line in enumerate(subtitle) if "[Events]" in line), None)
    if events_at is not None:
        # Clamp at 0: with "[Events]" on the first line, index -1 would put
        # the text in front of the LAST line instead.
        subtitle.insert(max(events_at - 1, 0), text + "\n")

    # Mode "w" truncates the file before the new content is written.
    with open(danmu_name, 'w', encoding=codetype) as f:
        f.write(''.join(subtitle))
def addass_to_danmu(danmu_name, codetype, ass_text):
    """Append the given subtitle lines to the end of the danmaku file.

    ``codetype`` is the encoding used for writing; ``None`` means UTF-8.
    """
    encoding = 'utf-8' if codetype is None else codetype
    # Mode "a" keeps the existing content and adds the new lines after it.
    with open(danmu_name, 'a', encoding=encoding) as out:
        for chunk in ass_text:
            out.write(chunk)


def add_ass_to_danmu(danmu_name, ass_name):
    """Merge an existing ASS subtitle into the danmaku ASS file.

    The ``Style:`` lines of ``ass_name`` are inserted into the styles section
    of ``danmu_name`` and its event lines are appended to the end.

    Args:
        danmu_name: Path of the danmaku ASS file, modified in place.
        ass_name: Path of the original subtitle file, only read.
    """
    # Detect each file's encoding once: the subtitle file does not change,
    # and the danmaku file must be written with one encoding throughout.
    ass_codetype = get_codetype(ass_name)
    danmu_codetype = get_codetype(danmu_name)

    # Read everything from the source before touching the target.
    style_text = read_ass_style(ass_name, ass_codetype)
    ass_text = read_ass_file(ass_name, ass_codetype)

    addline_to_danmu(danmu_name, danmu_codetype, style_text)
    addass_to_danmu(danmu_name, danmu_codetype, ass_text)

def aiqiyi_bullet(tvid, file_path):
    """Download the danmaku for ``tvid`` and convert it to an ASS file.

    Every XML page returned by ``get_bullet`` is passed to ``xml_to_ass``.
    Returns the path of the ``.Danish.ass`` file next to ``file_path``.
    """
    danmu_name = ''.join([
        os.path.dirname(file_path),
        "/",
        filename_find(file_path, 2),
        ".Danish.ass",
    ])
    xml_pages = get_bullet(tvid, file_path)
    for xml_page in xml_pages:
        xml_to_ass(xml_page, file_path)
    return danmu_name


def main():
    """Demo run: print the tvid of every episode of a sample album."""
    # How to obtain the ids:
    #   * from a uri returned by the Douban API (like the sample below), take
    #     the values directly, e.g. get_str_btw(uri, 'tvid=', '&') and
    #     get_str_btw(uri, 'aid=', '&');
    #   * from an iQiyi play-page link, use get_tvid(url) / get_aid(url).
    # With a tvid and the path of the video file,
    # aiqiyi_bullet(tvid, file_path) downloads the danmaku and converts it
    # to an .ass file.
    uri = "iqiyi://mobile/player?aid=225041201&tvid=9749815600&ftype=27&subtype=333"
    aid = '212447801'
    tvid_list = get_tvid_list(aid)
    print(tvid_list)


if __name__ == "__main__":
    main()

 

相关推荐
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页