# NOTE: this is an informal (non-compliant) scraper, so each run feels a bit risky;
# the target sites are shady ones hosting sensitive content, so the URL is withheld here.
import requests
from bs4 import BeautifulSoup
import time
import socket
import os
import re
import bs4
def getText(url):
    """Download the binary content (a video) at *url*.

    Uses a streamed GET with a 20s timeout and a minimal User-Agent.
    Returns the raw response bytes, or None when the request fails.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=20, headers=header, stream=True)
        r.raise_for_status()
        return r.content
    except requests.RequestException:
        # Narrowed from a bare "except:" so Ctrl-C still interrupts the run.
        # Keep the original best-effort contract: report and return None.
        print("申请视频错误")
        return None
def writeFile(path, content):
    """Write *content* (bytes) to *path*, but only if the file is absent.

    Existing files are never overwritten, so re-running the scraper
    skips videos that were already downloaded.
    """
    if not os.path.exists(path):
        # The context manager flushes and closes on exit; the original's
        # explicit flush() and empty "else: pass" branch were redundant.
        with open(path, "wb") as file:
            file.write(content)
def dealUrl(url, dir):
    """Build the local save path for a video URL.

    "http://host/<id>/<name>.ext" maps to dir + "<id>_<name>.mp4".
    Creates *dir* if it does not exist. NOTE: *dir* is expected to end
    with a path separator (the caller passes e.g. "D:\\mp4_MV\\").
    """
    if not os.path.exists(dir):
        os.mkdir(dir)
    # Renamed locals: the original shadowed the builtin "str".
    segments = url.split("/")
    stem = segments[-1].split(".")[0]
    filename = segments[-2] + "_" + stem + ".mp4"
    return dir + filename
def getHtml(url):
    """Fetch *url* and return its decoded HTML text.

    On an HTTP error returns the numeric status code (the caller
    checks for 404); on a connection-level failure returns None.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    r = None
    try:
        r = requests.get(url, timeout=20, headers=header)
        r.encoding = r.apparent_encoding  # best-guess decoding for GBK/UTF-8 pages
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        print("申请网站错误")
        # Bug fix: the original referenced r unconditionally here and
        # crashed with NameError when requests.get() itself raised.
        return r.status_code if r is not None else None
def getMvUrl(html):
    """Collect the unique mv-page hrefs (paths containing "/all/") from a
    listing page.

    Anchors are kept only when they have no "rel" attribute and contain
    at least one child tag (e.g. a thumbnail), which filters out plain
    text navigation links.
    """
    soup = BeautifulSoup(html, "html.parser")
    found = []
    for anchor in soup("a", href=re.compile("/all/")):
        if not isinstance(anchor, bs4.element.Tag):
            continue
        if anchor.has_attr('rel'):
            continue
        # Calling a Tag searches its descendants; require at least one
        # real child tag inside the anchor.
        if not anchor(lambda t: t.name != None):
            continue
        href = anchor["href"]
        if href not in found:
            found.append(href)
    return found
def dealMvUrl(mvs, url):
    """Join each relative mv path onto the site root taken from *url*.

    *url* is expected to look like "http://host//page", so splitting on
    "//" yields [scheme, host, page]; each entry of *mvs* (e.g.
    "/all/x.html") is appended to "scheme//host".
    """
    parts = url.split("//")
    # Hoisted loop invariant; the original rebuilt the prefix (and
    # mutated the split list) on every iteration.
    base = parts[0] + "//" + parts[1]
    return [base + mv for mv in mvs]
def getVideoUrl(url):
    """Open an mv page and extract the actual video source URL.

    The page embeds the source inside <div id="vod"> as a "$"-separated
    string whose second-to-last field is the media URL.
    Returns None when the request or the extraction fails.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=20, headers=header)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
    except requests.RequestException:
        # Narrowed from a bare "except:"; only network/HTTP failures
        # produce this message now.
        print("申请视频页面错误")
        return None
    soup = BeautifulSoup(r.text, "html.parser")
    video = soup.find("div", {"id": "vod"})
    if isinstance(video, bs4.element.Tag) and video.string:
        # Bug fix: video.string can be None (mixed content), which the
        # original masked with its bare except; guard short splits too.
        fields = video.string.split("$")
        if len(fields) >= 2:
            return fields[-2]
    return None
def getPages(html):
    """Extract sub-page paths (hrefs containing "/all/") from the site menu.

    Looks inside the <div class="menu on"> navigation block.
    Returns an empty list when that block is missing.
    """
    soup = BeautifulSoup(html, "html.parser")
    menu = soup.find("div", {"class": "menu on"})
    if menu is None:
        # Bug fix: the original called menus(...) on None and crashed
        # with TypeError when the menu div was absent from the page.
        return []
    pages = []
    for a in menu("a", href=re.compile("/all/")):
        if isinstance(a, bs4.element.Tag):
            pages.append(a["href"])
    return pages
def dealPageUrl(pages, url):  # normalize sub-page requests; scrape pages 2-12
    """Expand each sub-page path into its listing pages 2 through 12.

    For url "http://www.x.com//index.html" and page "/all/a.html" this
    yields "http://www.x.com/all/a2.html" ... "http://www.x.com/all/a12.html".
    Assumes the joined URL splits on "." into exactly four parts
    (scheme+www / domain / path stem / extension).
    """
    # Renamed locals: the original shadowed the builtins "abs" and "str".
    results = []
    url_parts = url.split("/")
    root = url_parts[0] + "//" + url_parts[2]  # "http:" + "//" + host
    for page in pages:
        dot_parts = (root + page).split(".")
        for page_no in range(2, 13):
            results.append("{0}.{1}.{2}{3}.{4}".format(
                dot_parts[0], dot_parts[1], dot_parts[2], page_no, dot_parts[3]))
    return results
if __name__ == '__main__':
    dir_url = "D:\\mp4_MV\\"  # local folder where the videos are saved
    url = "http://www.****.com//index.html"
    mvUrls = []   # mv relative paths of the current sub-page
    mv_num = 0    # running total of video links discovered
    now_op = 1    # 1-based counter of download attempts

    html_main = getHtml(url)                    # fetch the main page structure
    pageUrls = getPages(html_main)              # sub-page paths from the menu
    abs_page_url = dealPageUrl(pageUrls, url)   # absolute URLs for pages 2-12

    for page in abs_page_url:
        html = getHtml(page)
        # getHtml returns the status code on HTTP errors (404 check kept
        # from the original) and may return None on connection failures.
        if html == 404 or html is None:
            print(html)
            continue
        mvUrls = getMvUrl(html)                 # mv relative paths on this page
        True_mv_path = dealMvUrl(mvUrls, url)   # absolute mv-page URLs
        mv_num += len(True_mv_path)
        for mvitem in True_mv_path:
            # Bug fix: bind path before the try so the error message below
            # cannot itself raise NameError on a first-iteration failure.
            path = ""
            try:
                print("开始第-" + str(now_op) + "-个")
                time.sleep(1)  # throttle: be gentle with the server
                src = getVideoUrl(mvitem)       # real video source URL
                path = dealUrl(src, dir_url)    # local save path (creates dir)
                content = getText(src)          # download the bytes
                writeFile(path, content)        # persist (skips existing files)
                print("第" + str(now_op) + "-个-this_ok--" + path)
                now_op += 1
            except Exception:
                # Narrowed from a bare "except:" so Ctrl-C can still stop the run.
                print("第-" + str(now_op) + "-个-this--" + path + "-出现错误")
                now_op += 1
                continue
    print("all_ok")
    print("{:-^20}".format("共有_" + str(mv_num) + "_个视频"))