Note: what I did here is not a well-behaved crawler, and I felt a little guilty every run, so the targets were some shady sites QAQ. They involve sensitive content, so I won't give out the site URLs, haha.
Also, I originally meant to crawl the whole site, which is why the logic is a bit long-winded; later I decided a full-site crawl was pointless (and my disk probably couldn't hold it anyway), so I dropped that. For now the code only handles a single page.
[Update (2019/2/6 11:09): after a recent Chrome update, Google now blocks selenium, so selenium can no longer launch Chrome. I haven't solved it yet; I've been busy setting up an SSM framework lately~]
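If you run into the same block, a workaround people commonly suggest is to launch Chrome with its automation hints turned off. A minimal, untested sketch (the switches below are standard Chrome/ChromeDriver options, not anything from this script):

from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")  # hide the navigator.webdriver hint
options.add_experimental_option("excludeSwitches", ["enable-automation"])  # drop the "controlled by automated test software" banner
browser = webdriver.Chrome(options=options)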
import requests
import bs4
from bs4 import BeautifulSoup
import os
import time
from selenium import webdriver
import re
import threading
index_html="http://www.***.com"
http_index="https://***.com"
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0'}
dirPath="D:\\Tsdir\\"
tsListPath=r"D:\Tsdir\M3u8_List.txt"
tsList_Path=r"D:\Tsdir\tsList.txt"
tsList=[]
first_List_Url=[]
# def getFirstList():# collect every top-level nav-menu URL and write them to a file
# try:
# r=requests.get(r"http://www.***.com/",headers=header,timeout=20)
# r.raise_for_status()
# r.encoding=r.apparent_encoding
# soup=BeautifulSoup(r.text,"html.parser")
# first_list=soup.select("#menu-main-content a")
# for each in first_list:
# if each["href"]!="/":
# first_List_Url.append(index_html+each["href"])
# print(first_List_Url)
# path=dirPath+"FirstList.txt"
# if not os.path.exists(path):
# with open(path,"w") as f:
# for item in first_List_Url:
# f.write(item+"\n")
# except:
# print("获取一级菜单失败")
#getFirstList()
#EachHtmlOfEachVideoUrl=[]
# def getEachVideoHtml():# collect the URL of every video page on one listing page and write them to a file
# r=requests.get(r"http://www.***.com/?m=vod-type-id-16.html",timeout=20,headers=header)
# r.encoding=r.apparent_encoding
# r.raise_for_status()
# soup=BeautifulSoup(r.text,"html.parser")
# listOf_a=soup.select(".videos li a")
# # print(listOf_a)
# for item in listOf_a:
# EachHtmlOfEachVideoUrl.append(index_html+item["href"])
# print(EachHtmlOfEachVideoUrl)
# path=dirPath+"EachHtmlOfEachVideoUrl.txt"
# if not os.path.exists(path):
# with open(path,"a+") as f:
# for i in EachHtmlOfEachVideoUrl:
# f.write(i+"\n")
#getEachVideoHtml()
EachVideoDir=dirPath+"firstOne\\"# folder that holds one video's many .ts files, plus its m3u8 bookkeeping files
#VideoSrcs=[]
#http://www.***.com/?m=vod-play-id-19917-src-1-num-1.html
def getHtml_by_selenium(page_video_src):# fetch one video page's html, pull the src from its iframe, and write that src into the video folder's txt
try:
browser = webdriver.Chrome()
#browser.maximize_window()
browser.get(page_video_src)
iframe=browser.find_element_by_xpath("//td[@id='playleft']/iframe[2]")
src=iframe.get_attribute("src")
        #VideoSrcs.append(src)# add this video's src to the list of every src
        # append it to a file
        if not os.path.exists(EachVideoDir):# create this video's folder
os.mkdir(EachVideoDir)
path=EachVideoDir+"page_src.txt"
if not os.path.exists(path):
with open(path,"w") as f:#只需要写入一条就可以了
f.write(src)
browser.close()
    except Exception:
        print("failed to load the video page")
#getHtml_by_selenium()
def getIframeSrc():
    # read the "var main" value out of the script tag and build a URL; that URL leads to the real m3u8 address
with open(EachVideoDir + "page_src.txt", "r") as f:
url = f.read()
try:
r = requests.get(url, headers=header, timeout=20)
r.raise_for_status()
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, "html.parser")
script = soup("script", {"type": "text/javascript"})
text = script[0].string
m3u8 = re.compile('var main = ".*?"').findall(text)
m3u8_Url = m3u8[0].split('"')
True_m3u8 = http_index + m3u8_Url[1]
print("获取m3u8后缀地址链接为--")
print(True_m3u8)
return True_m3u8
    except Exception:
        print("failed to fetch the iframe src")
# print(True_m3u8)
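# A quick self-contained illustration of the regex extraction above; the script body
# here is made up for the example:
# >>> text = 'var zanpian = {}; var main = "/20190206/xyz/index.m3u8";'
# >>> re.compile('var main = ".*?"').findall(text)
# ['var main = "/20190206/xyz/index.m3u8"']
# >>> 'var main = "/20190206/xyz/index.m3u8"'.split('"')[1]
# '/20190206/xyz/index.m3u8'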
# def getSuffix():
# with open("D:\\Tsdir\\m.txt","r") as f:
# Suffix=f.read()
# return Suffix
def getFileM3u8(True_m3u8):
    # request the suffix link and derive the real m3u8 address
try:
rm = requests.get(True_m3u8, headers=header, timeout=20)
rm.raise_for_status()
Prefix = True_m3u8.split("/")
print("m3u8文件内容为--")
print(rm.text)
#print(Prefix)
        Suffix = rm.text.strip().split("\n")[-1]  # strip() so a trailing newline can't make the last line empty
        together_url = Prefix[0] + "//" + Prefix[2] + "/" + Prefix[3] + "/" + Prefix[4] + "/" + Suffix  # the real m3u8 URL that lists all the .ts segments
print("m3u8真实内容链接地址--")
print(together_url)
        # write the m3u8 URL into a .txt file
if not os.path.exists(EachVideoDir):
os.mkdir(EachVideoDir)
path = EachVideoDir + "m3u8.txt"
if not os.path.exists(path):
with open(path, "w") as f:
f.write(together_url)
        return together_url  # return the m3u8 URL
    except Exception:
        #Suffix=getSuffix()
        print("failed to fetch the m3u8 file")
def getTs(together_url):
    # fetch the playlist
try:
rr = requests.get(together_url, headers=header, timeout=20)
rr.raise_for_status()
iop = rr.text.split("\n")
if not os.path.exists(EachVideoDir):
os.mkdir(EachVideoDir)
        Path = EachVideoDir + "ts.txt"  # write ts.txt
if not os.path.exists(Path):
with open(Path, "w") as f:
                for eachitem in iop:
                    if eachitem and "#EXT" not in eachitem:  # keep segment names, skip tags and blank lines
                        f.write(eachitem + "\n")
    except Exception:
        print("failed to fetch the ts list file")
def getM3u8():# fetch the real m3u8 file and write it into this video's m3u8.txt
True_m3u8=getIframeSrc()
together_url=getFileM3u8(True_m3u8)
getTs(together_url)
return together_url
def getTsExist():# read every small .ts segment name back from ts.txt and return them as a list
    with open(EachVideoDir+"ts.txt","r") as f:
        x=[line for line in f.read().split("\n") if line]  # drop the blank entry left by the trailing newline
return x
#tss=getTsExist()# read back the ts list
def getTsVideo(m3,tss):
little=m3.split("/")
little=little[0]+"//"+little[2]+"/"+little[3]+"/"+little[4]+"/"+little[5]+"/"+little[6]+"/"
eachvideots=[]
for item in tss:
eachvideots.append(little+str(item))
return eachvideots
#m3 = getM3u8()
#eachvideos=getTsVideo(m3,tss)# build the full real URL for every ts segment
def writeTsToFile(eachvideos):
if not os.path.exists(EachVideoDir):
os.mkdir(EachVideoDir)
for eachitem in eachvideos:
try:
r=requests.get(eachitem,headers=header,timeout=20,stream=True)
r.raise_for_status()
            # build this segment's storage path
            ss=eachitem.split("/")[-1]
            path=EachVideoDir+ss
            if not os.path.exists(path):
                with open(path,"wb") as f:
                    for chunk in r.iter_content(chunk_size=1024*64):  # stream to disk instead of holding the whole segment in memory
                        f.write(chunk)
        except Exception:
            continue
#writeTsToFile()
def ThreadsDownload(eachvideos):
    lines=5
    print("starting download, thread count --{}".format(lines))
    lenOfV = len(eachvideos)
    eachPiece = lenOfV // lines
    pieces=[]
    for index in range(lines-1):
        pieces.append(eachvideos[index*eachPiece:(index+1)*eachPiece])
    pieces.append(eachvideos[(lines-1)*eachPiece:lenOfV])
    # download in parallel
    threads = []
    for piece in pieces:
        thread=threading.Thread(target=writeTsToFile,args=(piece,))  # args must be a one-element tuple
        threads.append(thread)
        thread.daemon = True
        thread.start()
    for item in threads:
        item.join()
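# Hedged alternative to the manual slicing above, using the standard library's thread
# pool (concurrent.futures is stdlib; the 5 workers simply mirror `lines`):
def PoolDownload(eachvideos):
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=5) as pool:
        pool.map(lambda url: writeTsToFile([url]), eachvideos)  # one segment per task; the with-block waits for all of them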
if __name__ == '__main__':
#Suffix=getSuffix()
#http://www.***.com/?m=vod-play-id-20213-src-1-num-1.html
"""
    Usage notes:
    The variable page_video_src holds the address of the movie's page; the commented URL above is an example.
    The downloaded clips land in D:\Tsdir; empty that folder before running.
    When the crawl finishes, merge the clips into one video with this cmd command:
copy /b "D:\Tsdir\firstOne\*.ts" "D:\Tsdir\firstOne\mv.mp4"
"""
page_video_src=r'http://www.***.com/?m=vod-play-id-47871-src-1-num-1.html'
    getHtml_by_selenium(page_video_src)# grab the iframe src
m3 = getM3u8()
tss=getTsExist()
eachvideos = getTsVideo(m3,tss)
    ThreadsDownload(eachvideos)# multi-threaded download
    #writeTsToFile(eachvideos)# single-threaded download
    print("download finished")