注:此博客无任何教程,只有代码和部分注释,博主自己看的!刚学习python三天,不喜勿喷
功能:
1.解析首页资源
2.解析首页对应标题下的资源页面链接
3.自动解析每个资源链接的.ts数量
4.自动解析每个资源对应的.m3u8资源(用于分析片段数量)
5.按标题分类存储
代码仅提供爬虫实现的思路,无法复用
code:
import multiprocessing
import os
import string
import requests
import re
from bs4 import BeautifulSoup
from multiprocessing import Pool
# Request headers — desktop Chrome UA so the site serves the normal page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
# Common prefix of every .m3u8 playlist URL (identical for all resources).
m3u8path = "https://xxx.xxx.com/"
# Common prefix of every resource-detail page URL (identical for all resources).
repath = "https://xxx.xxx.de/"
# Fetch the index page, which links to every resource-detail page.
response = requests.get("https://www.xxx.xx/xxx.html", headers=headers)
# Force UTF-8 so non-ASCII titles decode correctly.
response.encoding = "utf-8"
# Index-page HTML.
syHtml = response.text
soup = BeautifulSoup(syHtml, "lxml")
# CSS selector matching the anchor of each resource thumbnail on the index page.
resourceList = soup.select("a[class='video-pic loading']")
# Relative href of every resource-detail page.
# (Comprehension replaces the original index-based matrix.insert() loop.)
matrix = [a.get("href") for a in resourceList]
for url in matrix:
    # Fetch the detail page for this resource.
    response1 = requests.get(repath + url, headers=headers)
    response1.encoding = "utf-8"
    # Detail-page HTML.
    rehtml = response1.text
    soup1 = BeautifulSoup(rehtml, "lxml")
    # The element with id "vpath" holds the relative path of the .m3u8 playlist.
    reinfo = soup1.select_one("#vpath")
    # Video title, used as the per-video folder name.
    # BUGFIX: the original kept str.split()'s list and later did str(title),
    # producing folder names like "['word1', 'word2']"; use cleaned text instead.
    title = soup1.select_one(".player_title>h1").text.strip()
    # Full playlist URL; fetch it and count ".ts" occurrences to estimate
    # the number of media segments.
    m3u8url = m3u8path + reinfo.text.strip()
    response2 = requests.get(m3u8url, headers=headers)
    response2.encoding = "utf-8"
    count = response2.text.count("ts")
    # Segment URL prefix: everything up to and including the last "/".
    # (rfind replaces the original while/try index() scan; both yield ""
    # when the URL contains no "/".)
    vedioUrlPre = m3u8url[:m3u8url.rfind("/") + 1]
    # Create the per-title folder once per video, not once per segment.
    # NOTE: avoids shadowing the builtin `dir` as the original did.
    save_dir = "D:\\爬虫\\" + title
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # Segments are numbered 0000.ts, 0001.ts, ... — 7 chars, hence the
    # vedioUrl[-7:] slice below for the local file name.
    for i in range(0, count):
        vedioUrl = vedioUrlPre + "%04d.ts" % i
        response3 = requests.get(vedioUrl, headers=headers)
        print("开始写入资源:" + url + " 的第" + str(i) + "个片段")
        # BUGFIX: the original wrote response.content (the index page) into
        # every segment file; write the segment body we just fetched.
        # `with` also guarantees the handle is closed even on error.
        with open(save_dir + "\\{}".format(vedioUrl[-7:]), "wb") as file:
            file.write(response3.content)
        print("写入片段" + str(i) + "结束\n")
print("所有视频爬取完毕!!!")