python爬虫bilibili_python爬虫下载Bilibili番剧弹幕

本文绍如何利用python爬虫下载bilibili番剧弹幕。

准备:

python3环境

需要安装BeautifulSoup,selenium包

phantomjs

原理:

代码:

# -*- coding: utf-8 -*-

import requests

import json

import urllib.request

import zlib

import os

import re

from bs4 import BeautifulSoup

from urllib.parse import quote

from selenium import webdriver

headers = { 'Accept':'*/*',

'Accept-Encoding':'gzip, deflate, sdch, br',

'Accept-Language':'zh-CN,zh;q=0.8',

'Access-Control-Request-Headers':'content-encoding, content-type, x-za-batch-size, x-za-log-version, x-za-platform, x-za-product',

'Access-Control-Request-Method':'POST',

'Cache-Control':'no-cache',

'Connection':'keep-alive',

'Host':'bilibili-web-analytics.bilibili.com',

'Origin':'https://www.bilibili.com',

'Pragma':'no-cache',

'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'

}

for key, value in enumerate(headers):

webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.customHeaders.{}'.format(key)] = value

driver = webdriver.PhantomJS(executable_path='./phantomjs-2.1.1-windows/bin/phantomjs.exe')

def geAidByKeyword(Keyword):

print('正在搜索,请等待......')

search_url="http://search.bilibili.com/bangumi?keyword="+Keyword

search_url=quote(search_url,safe='/:?=')

search_html = urllib.request.urlopen(search_url)

search_bsObj = BeautifulSoup(search_html,'html.parser')

search_linkList = search_bsObj.findAll("a",{"class":"title"})

count=len(search_linkList)

print('已找到%s个' %count)

i=0

for item in search_linkList:

print('%s:%s' % (i,search_linkList[i]['title']))

i=i+1

# 选择

select=input('请选择你要下载的编号:')

select=int(select)

# print(select)

search_link=search_linkList[select]['href']

if search_link=='':

print('输入无效字符')

return False

# 进入

# print(search_link)

search_link="http:"+search_link

select_link_html = urllib.request.urlopen(search_link)

select_link_bsObj = BeautifulSoup(select_link_html,'html.parser')

select_link_list=select_link_bsObj.findAll("li",{"data-newest-ep-id":re.compile("[0-9]+")})

data_season_id=select_link_list[0]['data-season-id']

data_newest_ep_id=select_link_list[0]['data-newest-ep-id']

new_aid_url="http://bangumi.bilibili.com/anime/%s/play#%s" % (data_season_id,data_newest_ep_id)

getNewAid(new_aid_url)

# print(data_season_id)

def getNewAid(url):

driver.get(url)

html = driver.page_source

bsObj = BeautifulSoup(html,'html.parser')

aid_list=bsObj.findAll("a",{"class":"v-av-link"})

aid=aid_list[0].get_text()

aid=aid[2:]

checkAndDown(aid)

def checkAndDown(aid):

title=getAnimeName(aid)

check=input('你要下载的是%s,是否要下载(1:是;0:否):' %title)

print(check)

if int(check)==1:

getPageList(aid)

else:

return False

def getAnimeName(aid):

# print(aid)

html = urllib.request.urlopen("http://www.bilibili.com/video/av%s" %aid)

bsObj = BeautifulSoup(html,'html.parser')

try:

title=bsObj.find("title").get_text()

except Exception as e:

print('获取失败,aid=%s' %aid)

return False

return title

def getPageList(aid):

url="http://www.bilibili.com/widget/getPageList"

params = {

'aid':aid

}

re = requests.get(url,params)

cidDic=json.loads(re.text)

animeName=getAnimeName(aid)

for cidItem in cidDic:

cidItem['animeName']=animeName

downloadDanmu(cidItem)

def downloadDanmu(cidItem):

cid=cidItem['cid']

animeName=cidItem['animeName']

pagename=cidItem['pagename']

comment_url="http://comment.bilibili.com/%s.xml" %cid

comment_page_zip=urllib.request.urlopen(comment_url).read()

comment=zlib.decompressobj(-zlib.MAX_WBITS).decompress(comment_page_zip)

if os.path.exists('%s' % animeName)==False:

os.makedirs('%s' % animeName)

fout=open('%s/%s.xml' % (animeName,pagename),'wb')

fout.write(comment)

fout.close()

print('%s %s下载完成' % (animeName,pagename))

def main():

# aid=input('请输入aid:')

# getPageList(aid)

keyword=input('请输入番剧的关键字:')

geAidByKeyword(keyword)

if __name__ == '__main__':

main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值