Learning Notes (4) [Python Crawler]: Scraping the cover images of all videos on a Bilibili search results page

The script below reads a search keyword and a page range, fetches each search results page, extracts the BV id of every video linked there, converts it to the corresponding AV id, queries the web API for the video's metadata, and saves the cover image under ./image/.
import os
import requests
import re
import json
from bs4 import BeautifulSoup

# Request headers for search.bilibili.com. Replace the Cookie value with the
# cookie string from your own logged-in Bilibili session (placeholders below);
# without a valid session the search page may not return full results.
headers = {
    'Host': 'search.bilibili.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Cookie': "sid=...; DedeUserID=...; SESSDATA=...; bili_jct=...; CURRENT_FNVAL=16",
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
    'TE': 'Trailers'
}
# BV id <-> AV id conversion (the widely-circulated base-58 algorithm).
table = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
tr = {}
for i in range(58):
    tr[table[i]] = i
s = [11, 10, 3, 8, 4, 6]    # positions of the 6 payload characters in a BV id
xor = 177451812
add = 8728348608

def dec(x):
    """Decode a full 12-character BV id (including the 'BV' prefix) to its AV number."""
    r = 0
    for i in range(6):
        r += tr[x[s[i]]] * 58**i
    return (r - add) ^ xor

def enc(x):
    """Encode an AV number back into a BV id."""
    x = (x ^ xor) + add
    r = list('BV1  4 1 7  ')    # 12-character template; the double spaces matter
    for i in range(6):
        r[s[i]] = table[x // 58**i % 58]
    return ''.join(r)
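# Sanity check with a widely-cited AV/BV pair (not from the original post):
# av170001 corresponds to BV17x411w7KC, and the two functions invert each other.
assert dec('BV17x411w7KC') == 170001
assert enc(170001) == 'BV17x411w7KC'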
def downloadCover(bvid):
    """Look up a video by its BV id and save its cover image to ./image/."""
    avid = str(dec(bvid))               # the view API below is queried by AV id
    print(avid)
    videoUrl = 'https://api.bilibili.com/x/web-interface/view?aid=' + avid
    videoInfo = requests.get(videoUrl)
    videoJson = json.loads(videoInfo.text)
    picSrc = videoJson['data']['pic']   # URL of the cover image
    imgInfo = requests.get(picSrc)
    if not os.path.exists('./image'):
        os.mkdir('./image')
    with open('./image/' + str(bvid) + '.jpg', 'wb') as f:
        f.write(imgInfo.content)
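# For example, downloadCover('BV17x411w7KC') (the test id above) looks up
# av170001 and writes its cover image to ./image/BV17x411w7KC.jpg.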
if __name__ == '__main__':
    print("Enter a search keyword")
    keyword = input()
    print("Enter the page range to scrape (min max)")
    pageRange = input().split()
    pageMin = int(pageRange[0])
    pageMax = int(pageRange[1])
    for index in range(pageMin, pageMax + 1):
        url = 'https://search.bilibili.com/all?keyword=' + keyword + \
              '&page=' + str(index)
        print(url)
        htmlInfo = requests.get(url, headers=headers)
        soup = BeautifulSoup(htmlInfo.text, 'html.parser')
        # print(soup)                   # debug: dump the parsed page
        # Each search result links to its video page; collect every tag whose
        # href contains a BV id.
        videoList = soup.find_all(attrs={'href': re.compile('BV')})
        # print(videoList)              # debug: dump the matched tags
        bvidList = list()
        for item in videoList:
            href = str(item.get('href'))
            # Result hrefs look like .../video/BVxxxxxxxxxx?from=search...,
            # so take everything between 'BV' and the '?'.
            bvid = re.findall(r'BV(.+?)\?', href)
            if not bvid:                # skip hrefs without a '?' after the id
                continue
            bvidList.append('BV' + bvid[0])
            downloadCover('BV' + bvid[0])
        print(bvidList)
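A few fragile points in the loop above are worth noting: the pattern BV(.+?)\? only matches when a '?' follows the id, the same id typically appears twice per result (thumbnail link and title link), and plain string concatenation leaves a Chinese keyword URL-unencoded. Below is a minimal hardened sketch of the same loop, assuming the standard 12-character BV format (the 10 payload characters match the enc template above):

if __name__ == '__main__':
    keyword = input("Enter a search keyword: ")
    pageMin, pageMax = map(int, input("Enter the page range (min max): ").split())
    seen = set()                                   # download each cover only once
    for index in range(pageMin, pageMax + 1):
        # Passing params= lets requests URL-encode the keyword for us.
        htmlInfo = requests.get('https://search.bilibili.com/all',
                                params={'keyword': keyword, 'page': index},
                                headers=headers)
        # Match 'BV' plus 10 alphanumeric characters directly, with no
        # dependence on a trailing '?' in the href.
        for bvid in re.findall(r'BV[0-9A-Za-z]{10}', htmlInfo.text):
            if bvid not in seen:
                seen.add(bvid)
                downloadCover(bvid)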
Reproduction or excerpting of this article is prohibited.