Python 爬虫:短视频爬取的案例解析
案例如下:目标 url 为 https://www.pearvideo.com/
# -*- coding: utf-8 -*-
import requests
import re
# srcUrl="(.*?)",vdoUrl
# https://video.pearvideo.com/mp4/short/20200108/cont-1640135-14776159-hd.mp4
# https://www.pearvideo.com/video_1640135
# Browser-like request headers used when fetching the HTML pages.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/79.0.3945.88 Safari/537.36"
    ),
}

# Site root: fetched as the index page and used as the prefix for the
# relative video-page links extracted from it.
url = "https://www.pearvideo.com/"
def download_file(url, path, headers=None, chunk_size=1024, timeout=30):
    """Stream the resource at ``url`` into the local file ``path``.

    Args:
        url: Direct link to the video file to download.
        path: Destination file path. It is opened in binary write mode, so
            an existing file is overwritten; the parent directory is not
            created here.
        headers: Optional dict of HTTP headers to send with the request.
        chunk_size: Number of bytes requested per chunk from the response
            stream.
        timeout: Timeout in seconds handed to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
        # Check the status before opening the file so an error page is never
        # saved under the video's file name.
        r.raise_for_status()
        # The Content-Length header is deliberately not read: its value was
        # never used, and indexing it raised KeyError whenever the server
        # omitted the header.
        print("开始下载...")
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
# Fetch the home page. Only this single page of short videos is crawled; for
# multi-page crawling see the author's previous article on music resources.
req = requests.get(url=url, headers=headers, timeout=30)
req.raise_for_status()
request = req.text

# Each video on the home page is linked through an anchor with this class;
# the captured href is relative and gets appended to the site root below.
pat1 = r'href="(.*?)" class="vervideo-lilink actplay"'
list_urls = re.findall(pat1, request)

# Pattern that captures the srcUrl="..." value from a video page; that value
# is what gets passed to download_file as the video address.
pat2 = r'srcUrl="(.*?)",vdoUrl'

for i, page_link in enumerate(list_urls):
    # NOTE: download_file opens this path directly, so D:\video must already
    # exist.
    new_path = "D:\\video\\{}.mp4".format(i)
    new_url = url + page_link
    reqs = requests.get(url=new_url, headers=headers, timeout=30)
    new_url_request = reqs.text
    match = re.search(pat2, new_url_request)
    if match is None:
        # A page without srcUrl used to raise IndexError and abort every
        # remaining download; skip just this one instead.
        print("未找到视频地址, 跳过: {}".format(new_url))
        continue
    last_url = match.group(1)
    download_file(last_url, new_path)  # download via the helper defined above
    print(last_url)
爬取结果如图所示
希望对你有所启发,如有疑问,欢迎评论。