python网络爬虫
利用python获取b站视频封面及弹幕
获取弹幕
从 https://api.bilibili.com/x/v1/dm/list.so?oid=286266029 可以看到弹幕列表
该网页为 XML 格式,利用 etree.parse 和 XPath 解析获取弹幕列表
从https://api.bilibili.com/x/player/pagelist?bvid=BV1eh41127Ma&jsonp=jsonp中找到了oid(/cid)
利用
data = json.loads(req.text)
对返回结果进行解析,得到一个 Python 字典对象。该字典的 data 项为一个列表,列表首个元素的 cid 项即为 cid
data = data['data'][0]['cid']
def jx(aid, oid):
    """Download the danmaku (bullet comments) for one video part and save them.

    Requests the danmaku XML for ``oid`` from the bilibili API, collects every
    text node found under ``<d>`` elements, writes them one per line (UTF-8)
    to ``bilibili<aid>.txt`` inside the directory held by the module-level
    ``path`` variable, and prints the collected list.

    Args:
        aid: Video id as entered by the user; only used to name the output file.
        oid: The cid/oid of the video part, as a string.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + oid
    # Accept-Encoding is deliberately left to requests so that only encodings
    # it can actually decode are advertised; Host/Connection are set by it too.
    head = {
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.92 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # The headers must be passed by keyword: the second positional argument of
    # requests.get() is `params`, which would append them to the query string
    # instead of sending them as HTTP headers.
    rp = requests.get(url, headers=head, timeout=10)
    rp.raise_for_status()
    # Parse the response in memory (no scratch file in the working directory),
    # using the same lenient HTML parser as before.
    ht = etree.fromstring(rp.content, etree.HTMLParser())
    results = ht.xpath('//d//text()')
    with open(f"{path.get()}/bilibili{aid}.txt", 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in results)
    print(results)
def get_oid(id, kind):
    """Look up the cid of a video's first part and download its danmaku.

    Ensures the output directory exists, queries the bilibili ``pagelist`` API
    (by ``aid`` when ``kind`` is ``'av'``, otherwise by ``bvid``), takes the
    ``cid`` of the first entry of the returned ``data`` list and hands it to
    ``jx``.

    Args:
        id: Video id as entered by the user. An ``av`` prefix is dropped for
            av lookups and a missing ``BV`` prefix is added for BV lookups, so
            the id may be typed with or without its prefix.
        kind: ``'av'`` to treat ``id`` as an av number; anything else is
            treated as a BV id.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        ValueError: If the response carries no part list.
    """
    creat_path()
    video_id = str(id).strip()
    if kind == 'av':
        if video_id[:2].lower() == 'av':
            video_id = video_id[2:]
        abu = 'https://api.bilibili.com/x/player/pagelist?aid=' + video_id + '&jsonp=jsonp'
    else:
        if video_id[:2].upper() != 'BV':
            video_id = 'BV' + video_id
        abu = 'https://api.bilibili.com/x/player/pagelist?bvid=' + video_id + '&jsonp=jsonp'
    # Accept-Encoding is deliberately left to requests so that only encodings
    # it can actually decode are advertised; Host/Connection are set by it too.
    h = {
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.92 Safari/537.36',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # Pass the headers by keyword: positionally they would be taken as `params`
    # and end up in the query string instead of the HTTP headers.
    req = requests.get(abu, headers=h, timeout=10)
    req.raise_for_status()
    payload = json.loads(req.text)
    parts = payload.get('data') if isinstance(payload, dict) else None
    if not parts:
        # Fail with a readable message instead of an opaque TypeError/IndexError.
        raise ValueError('pagelist API returned no video parts: ' + req.text[:200])
    cid = parts[0]['cid']
    # The id is forwarded exactly as entered; jx only uses it for the file name.
    jx(id, str(cid))
获取封面
于视频主页的html中的head可以看到图片链接地址。
(部分视频会出现请求错误,原因可能是请求头有问题,触发了B站的反爬机制)
def imaged(bv, kind):
    """Download the cover image of a video into the output directory.

    Ensures the output directory exists, fetches the video page, and for every
    ``<meta itemprop="image">`` tag downloads the URL in its ``content``
    attribute to ``<bv><ext>`` inside the directory held by the module-level
    ``path`` variable.

    Args:
        bv: Video id as entered by the user, with or without its ``av``/``BV``
            prefix. It is used unchanged in the output file name.
        kind: ``'av'`` to treat ``bv`` as an av number; anything else is
            treated as a BV id.

    Raises:
        requests.HTTPError: If the video page answers with an HTTP error status.
    """
    from urllib.parse import urljoin, urlsplit

    creat_path()
    prefix = 'av' if kind == 'av' else 'BV'
    video_id = bv.strip()
    # Accept ids typed with their prefix so the URL never becomes ".../BVBV...".
    if video_id[:2].lower() == prefix.lower():
        video_id = video_id[2:]
    url = 'https://www.bilibili.com/video/' + prefix + video_id + '/'
    h = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    # Pass the headers by keyword: positionally they would be taken as `params`
    # and the User-Agent would never be sent as an HTTP header.
    resp = requests.get(url, headers=h, timeout=10)
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    for tag in soup.find_all('meta', itemprop="image"):
        content = tag.get('content')
        if not content:
            continue
        # Resolve protocol-relative ("//host/...") or relative links against
        # the page URL so urlretrieve always receives an absolute URL.
        p_url = urljoin(url, content)
        # Take the extension from the URL path only, ignoring any query string.
        ext = os.path.splitext(urlsplit(p_url).path)[1]
        urllib.request.urlretrieve(p_url, path.get() + '/' + bv + ext)
完整代码
import requests
from lxml import etree
import json
import bs4
import urllib.request
import tkinter
from tkinter import filedialog
from tkinter import ttk
import os
def creat_path():
    """Create the directory held by the module-level ``path`` variable if absent."""
    target = path.get()
    if os.path.exists(target):
        # Something already exists at that location; nothing to create.
        return
    os.makedirs(target)
def jx(aid, oid):
    """Download the danmaku (bullet comments) for one video part and save them.

    Requests the danmaku XML for ``oid`` from the bilibili API, collects every
    text node found under ``<d>`` elements, writes them one per line (UTF-8)
    to ``bilibili<aid>.txt`` inside the directory held by the module-level
    ``path`` variable, and prints the collected list.

    Args:
        aid: Video id as entered by the user; only used to name the output file.
        oid: The cid/oid of the video part, as a string.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + oid
    # Accept-Encoding is deliberately left to requests so that only encodings
    # it can actually decode are advertised; Host/Connection are set by it too.
    head = {
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.92 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # The headers must be passed by keyword: the second positional argument of
    # requests.get() is `params`, which would append them to the query string
    # instead of sending them as HTTP headers.
    rp = requests.get(url, headers=head, timeout=10)
    rp.raise_for_status()
    # Parse the response in memory (no scratch file in the working directory),
    # using the same lenient HTML parser as before.
    ht = etree.fromstring(rp.content, etree.HTMLParser())
    results = ht.xpath('//d//text()')
    with open(f"{path.get()}/bilibili{aid}.txt", 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in results)
    print(results)
def get_oid(id, kind):
    """Look up the cid of a video's first part and download its danmaku.

    Ensures the output directory exists, queries the bilibili ``pagelist`` API
    (by ``aid`` when ``kind`` is ``'av'``, otherwise by ``bvid``), takes the
    ``cid`` of the first entry of the returned ``data`` list and hands it to
    ``jx``.

    Args:
        id: Video id as entered by the user. An ``av`` prefix is dropped for
            av lookups and a missing ``BV`` prefix is added for BV lookups, so
            the id may be typed with or without its prefix.
        kind: ``'av'`` to treat ``id`` as an av number; anything else is
            treated as a BV id.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        ValueError: If the response carries no part list.
    """
    creat_path()
    video_id = str(id).strip()
    # The cid is served by a different query parameter for av and BV ids.
    if kind == 'av':
        if video_id[:2].lower() == 'av':
            video_id = video_id[2:]
        abu = 'https://api.bilibili.com/x/player/pagelist?aid=' + video_id + '&jsonp=jsonp'
    else:
        if video_id[:2].upper() != 'BV':
            video_id = 'BV' + video_id
        abu = 'https://api.bilibili.com/x/player/pagelist?bvid=' + video_id + '&jsonp=jsonp'
    # Accept-Encoding is deliberately left to requests so that only encodings
    # it can actually decode are advertised; Host/Connection are set by it too.
    h = {
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.92 Safari/537.36',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # Pass the headers by keyword: positionally they would be taken as `params`
    # and end up in the query string instead of the HTTP headers.
    req = requests.get(abu, headers=h, timeout=10)
    req.raise_for_status()
    payload = json.loads(req.text)
    parts = payload.get('data') if isinstance(payload, dict) else None
    if not parts:
        # Fail with a readable message instead of an opaque TypeError/IndexError.
        raise ValueError('pagelist API returned no video parts: ' + req.text[:200])
    cid = parts[0]['cid']
    # The id is forwarded exactly as entered; jx only uses it for the file name.
    jx(id, str(cid))
def imaged(bv, kind):
    """Download the cover image of a video into the output directory.

    Ensures the output directory exists, fetches the video page, and for every
    ``<meta itemprop="image">`` tag downloads the URL in its ``content``
    attribute to ``<bv><ext>`` inside the directory held by the module-level
    ``path`` variable.

    Args:
        bv: Video id as entered by the user, with or without its ``av``/``BV``
            prefix. It is used unchanged in the output file name.
        kind: ``'av'`` to treat ``bv`` as an av number; anything else is
            treated as a BV id.

    Raises:
        requests.HTTPError: If the video page answers with an HTTP error status.
    """
    from urllib.parse import urljoin, urlsplit

    creat_path()
    prefix = 'av' if kind == 'av' else 'BV'
    video_id = bv.strip()
    # Accept ids typed with their prefix so the URL never becomes ".../BVBV...".
    if video_id[:2].lower() == prefix.lower():
        video_id = video_id[2:]
    url = 'https://www.bilibili.com/video/' + prefix + video_id + '/'
    h = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    # Pass the headers by keyword: positionally they would be taken as `params`
    # and the User-Agent would never be sent as an HTTP header.
    resp = requests.get(url, headers=h, timeout=10)
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    for tag in soup.find_all('meta', itemprop="image"):
        content = tag.get('content')
        if not content:
            continue
        # Resolve protocol-relative ("//host/...") or relative links against
        # the page URL so urlretrieve always receives an absolute URL.
        p_url = urljoin(url, content)
        # Take the extension from the URL path only, ignoring any query string.
        ext = os.path.splitext(urlsplit(p_url).path)[1]
        urllib.request.urlretrieve(p_url, path.get() + '/' + bv + ext)
def sel_path():
    """Let the user pick the output directory and store it in ``path``.

    Opens a directory chooser dialog. When the user cancels, the dialog yields
    an empty value and the previously stored directory is kept, so later
    downloads do not try to create a directory with an empty name.
    """
    path_ = tkinter.filedialog.askdirectory()
    if path_:
        path.set(path_)
if __name__ == '__main__':
    # Build the small Tk window: id entry, av/BV selector, output directory
    # row, and one button per download action.
    root = tkinter.Tk()
    root.geometry('250x150')

    # Must remain a module-level name: the download helpers read it via path.get().
    path = tkinter.StringVar(root, value="D:/APC")

    def fetch_danmaku():
        """Download the danmaku for the id and kind currently entered."""
        get_oid(video_entry.get(), kind_box.get())

    def fetch_cover():
        """Download the cover for the id and kind currently entered."""
        imaged(video_entry.get(), kind_box.get())

    # Widgets are created in the same order as before; only the grid calls
    # have been moved next to the widget they place.
    kind_box = tkinter.ttk.Combobox(width=3, state='readonly', values=('BV', 'av'))
    kind_box.grid(row=0, column=2)
    kind_box.current(0)  # preselect 'BV'

    id_label = tkinter.Label(text='视频号')
    id_label.grid(row=0, column=1)

    danmaku_button = tkinter.Button(root, text='弹幕', command=fetch_danmaku)
    danmaku_button.grid(row=5, column=0)

    video_entry = tkinter.Entry()
    video_entry.grid(row=0, column=0)

    cover_button = tkinter.Button(root, text='封面', command=fetch_cover)
    cover_button.grid(row=5, column=1)

    # Output directory row: editable entry bound to `path`, plus a chooser button.
    path_entry = tkinter.Entry(textvariable=path)
    path_entry.grid(row=4, column=0)
    choose_button = tkinter.Button(text='路径选择', command=sel_path)
    choose_button.grid(row=4, column=1)

    root.mainloop()