# 仅供学习交流使用，如有侵权，请联系删除。
# Python 爬虫：抓取百度百科视频的源代码
from urllib.parse import quote
from bs4 import BeautifulSoup
import requests
import re
# Headers sent with the HTML page request that yields the secondId.
# NOTE: "br" is intentionally not advertised in Accept-Encoding. requests only
# decodes Brotli when an optional brotli package is installed; without it the
# body would arrive still compressed and the secondId regex would find nothing.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'baike.baidu.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/71.0.3578.98Safari/537.36',
}
# Headers sent with the XHR-style API request that returns the video URL.
# NOTE: "br" is intentionally not advertised in Accept-Encoding. requests only
# decodes Brotli when an optional brotli package is installed; without it the
# API response body would arrive still compressed and could not be parsed.
header2 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'baike.baidu.com',
    'User-Agent': 'Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/71.0.3578.98Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
# Look up the secondId that belongs to an entry name.
def getSecondId(name, timeout=10):
    """Return the ``secondId`` found on the Baidu Baike page for *name*.

    The entry page is fetched, re-serialised through BeautifulSoup, and
    scanned for ``"secondId":<digits>`` fragments (up to 10 digits each).
    The second occurrence is the one used; it is echoed to stdout in its
    ``"secondId":<digits>`` form and its digits are returned.

    Args:
        name: Entry title; it is percent-encoded into the item URL.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The digits of the second ``"secondId"`` occurrence, as a string.

    Raises:
        IndexError: If the page holds fewer than two ``"secondId"`` matches.
        requests.RequestException: If the request fails or times out.
    """
    url = "https://baike.baidu.com/item/" + quote(name)
    r = requests.get(url, headers=header, timeout=timeout)
    r.encoding = "utf-8"
    # Search the BeautifulSoup serialisation (not the raw text) so the set
    # and order of matches stays exactly what it was before.
    page = str(BeautifulSoup(r.text, 'lxml'))
    # Raw string: "\d" in a normal string literal is an invalid escape.
    ids = re.findall(r'"secondId":(\d{0,10})', page)
    if len(ids) < 2:
        raise IndexError(
            'expected at least two "secondId" entries on the page for '
            '%r, found %d' % (name, len(ids))
        )
    second_id = ids[1]
    print('"secondId":' + second_id)
    return second_id
# Resolve the video link for a secondId.
def getUrl(secondId, timeout=10):
    """Return the MP4 URL that the playurl API reports for *secondId*.

    The response is decoded as JSON and the value at ``list.mp4Url`` is
    returned. Those are the keys the previous comma-splitting code looked for.
    Proper JSON decoding also handles ``\\/`` and ``\\uXXXX`` escapes and
    commas inside values, which the string slicing could not.

    Args:
        secondId: Identifier as returned by ``getSecondId``; converted with
            ``str()`` before being placed in the query string.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The MP4 URL as a string. It is also printed to stdout.

    Raises:
        ValueError: If the body is not JSON or lacks ``list.mp4Url``.
        requests.RequestException: If the request fails or times out.
    """
    # The t / _243463 query values are fixed timestamps kept from the
    # original request.
    url = (
        "https://baike.baidu.com/api/wikisecond/playurl?secondId="
        + str(secondId)
        + "&t=1549201158256&_243463=1549201151114"
    )
    r = requests.get(url, headers=header2, timeout=timeout)
    try:
        mp4_url = r.json()["list"]["mp4Url"]
    except (ValueError, KeyError, TypeError) as err:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        raise ValueError(
            "playurl response for secondId %r has no list.mp4Url" % (secondId,)
        ) from err
    print(mp4_url)
    return mp4_url
# Download the video.
def download(url, name, timeout=30):
    """Stream the resource at *url* into ``<name>.mp4`` in the working directory.

    Args:
        url: Direct link to the video file.
        name: Output file name without the ``.mp4`` extension.
        timeout: Seconds to wait for the connection and for each read;
            it is not a limit on the total download time.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            The check runs before the file is opened, so no file is created.
        requests.RequestException: If the request fails or times out.
    """
    print('start')
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail loudly rather than saving an HTML error page as "<name>.mp4".
        r.raise_for_status()
        with open(name + '.mp4', "wb") as mp4:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    mp4.write(chunk)
    print('download over')
# Example run: resolve the entry's secondId, turn it into a video URL, and
# save the video as "<name>.mp4". Guarded so that importing this module does
# not trigger network requests or a download; running it as a script behaves
# exactly as before.
if __name__ == "__main__":
    name = "杜鹃"
    secondId = getSecondId(name)
    url = getUrl(secondId)
    download(url, name)
# 我在这里给大家一个百度云链接，提取码：x62s。里面有可以直接在 Windows 上运行的工具，界面可能不太友好，请原谅我的美术功底。